/******************************************************************************* * CogTool Copyright Notice and Distribution Terms * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * CogTool is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * CogTool is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with CogTool; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * CogTool makes use of several third-party components, with the * following notices: * * Eclipse SWT version 3.448 * Eclipse GEF Draw2D version 3.2.1 * * Unless otherwise indicated, all Content made available by the Eclipse * Foundation is provided to you under the terms and conditions of the Eclipse * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this * Content and is also available at http://www.eclipse.org/legal/epl-v10.html. * * CLISP version 2.38 * * Copyright (c) Sam Steingold, Bruno Haible 2001-2006 * This software is distributed under the terms of the FSF Gnu Public License. * See COPYRIGHT file in clisp installation folder for more information. * * ACT-R 6.0 * * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & * John R Anderson. * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). 
* * Apache Jakarta Commons-Lang 2.1 * * This product contains software developed by the Apache Software Foundation * (http://www.apache.org/) * * jopt-simple version 1.0 * * Copyright (c) 2004-2013 Paul R. Holser, Jr. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Mozilla XULRunner 1.9.0.5 * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/. * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the * License for the specific language governing rights and limitations * under the License. * * The J2SE(TM) Java Runtime Environment version 5.0 * * Copyright 2009 Sun Microsystems, Inc., 4150 * Network Circle, Santa Clara, California 95054, U.S.A. All * rights reserved. U.S. 
* See the LICENSE file in the jre folder for more information.
 ******************************************************************************/

package edu.cmu.cs.hcii.cogtool.controller;

import java.util.Iterator;
import java.util.List;
import java.util.Set;

import edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry;
import edu.cmu.cs.hcii.cogtool.util.AggregateException;
import edu.cmu.cs.hcii.cogtool.util.Cancelable;
import edu.cmu.cs.hcii.cogtool.util.ProgressCallback;

/**
 * Web crawler that uses the SWT Mozilla browser to visit each page,
 * allowing for a DOM walk to parse child links and, if requested,
 * acquiring the image of each page.  It is expected that this crawler
 * will do its work (the crawlWeb invocation) in a child thread.
 * <p>
 * The actual page fetching is delegated to an {@link IImportURL}
 * implementation supplied at construction time.
 */
public class ImportWebCrawler extends WebCrawler {
    /**
     * An algorithm that fetches a single page together with its child
     * links, determining the page's background image and location/extent
     * and, for each link, its label and location/extent.
     */
    public interface IImportURL {
        /**
         * Page information extended with the (optional) page image
         * gleaned while importing the page.
         */
        public static class ImportPageInfo extends WebCrawler.PageInfo {
            /** Background image bytes; null when no image was acquired. */
            public byte[] background = null;

            /** X coordinate associated with the background image. */
            public int bkgImageX = 0;

            /** Y coordinate associated with the background image. */
            public int bkgImageY = 0;

            /** Width associated with the background image. */
            public int bkgImageWidth = 0;

            /** Height associated with the background image. */
            public int bkgImageHeight = 0;

            /**
             * @param pageURL the URL of the page, passed to the superclass
             */
            public ImportPageInfo(String pageURL) {
                super(pageURL);
            }
        }

        /**
         * Fetch the page described by the given crawl entry.
         *
         * @param entry the crawl entry identifying the page to fetch
         * @return the information gathered for the fetched page
         */
        public ImportPageInfo fetchPage(URLCrawlEntry entry);

        /**
         * Return any exceptions thrown by the last call to fetchPage.
         */
        public AggregateException getThrownExceptions();

        /**
         * Recover any system resources.
         */
        public void dispose();
    }

    /** May be null; when present, consulted to detect user cancellation. */
    protected Cancelable cancelState;

    /** May be null; when present, told which URL is being visited. */
    protected ProgressCallback progressState;

    // URL fetches are currently serialized, so a single instance suffices.
    protected IImportURL importURL;

    /** May be null; when present, URLs in this set are not crawled. */
    protected Set<String> pruneURLs = null;

    /**
     * Initialize the web crawler.  Must be invoked in the main thread.
     *
     * @param importAlgorithm the algorithm to use for importing each page
     * @param cancelable may be null; if not, allows user to cancel the process
     * @param progressCB may be null; if not, used to indicate which URL
     *                   is currently being visited
     * @param pruneURLSet may be null; if not, URLs contained in this set
     *                    are excluded from the crawl
     */
    public ImportWebCrawler(IImportURL importAlgorithm,
                            Cancelable cancelable,
                            ProgressCallback progressCB,
                            Set<String> pruneURLSet) {
        importURL = importAlgorithm;
        pruneURLs = pruneURLSet;
        cancelState = cancelable;
        progressState = progressCB;
    }

    /**
     * Adds URLs to the initial crawl queue.  When a set of to-be-pruned
     * URLs is present (i.e., URLs fetched in a previous crawl), only
     * entries whose URL is not in that set are queued; otherwise the
     * superclass behavior is used unchanged.
     *
     * @param urls the candidate entries to queue for crawling
     */
    // TODO this appears never to be called, but it has been left in place
    //      rather than excised
    @Override
    public void addURLsToCrawl(List<URLCrawlEntry> urls) {
        if (pruneURLs == null) {
            // No pruning requested; defer entirely to the default behavior.
            super.addURLsToCrawl(urls);
            return;
        }

        for (URLCrawlEntry candidate : urls) {
            if (pruneURLs.contains(candidate.getURL())) {
                continue;   // already handled by an earlier crawl
            }

            urlsToCrawl.add(candidate);
        }
    }

    /**
     * When a Cancelable instance was supplied, the crawl continues only
     * while the user has not canceled; otherwise the superclass decides.
     *
     * @return true if the crawl may proceed
     */
    @Override
    protected boolean crawlMayContinue() {
        if (cancelState == null) {
            return super.crawlMayContinue();
        }

        return ! cancelState.isCanceled();
    }

    /**
     * Asks the superclass whether the given entry needs to be crawled and,
     * if so, reports the entry's URL to the progress callback (when one
     * was supplied) before answering.
     *
     * @param entry the entry about to be visited
     * @return true if the entry should be crawled
     */
    @Override
    protected boolean crawlNeeded(URLCrawlEntry entry) {
        if (! super.crawlNeeded(entry)) {
            return false;
        }

        if (progressState != null) {
            progressState.updateProgress(0.0, entry.getURL());
        }

        return true;
    }

    /**
     * A link is crawled only if the superclass approves it and, when a
     * to-be-pruned set exists (i.e., URLs fetched by a previous crawl),
     * the link's URL is not a member of that set.
     *
     * @param newLink the link being considered
     * @return true if the link should be crawled
     */
    @Override
    protected boolean shouldCrawlLink(URLCrawlEntry newLink) {
        if (! super.shouldCrawlLink(newLink)) {
            return false;
        }

        if (pruneURLs == null) {
            return true;
        }

        return ! pruneURLs.contains(newLink.getURL());
    }

    /**
     * Delegates the page fetch to the IImportURL instance given at
     * construction.
     *
     * @param entry the entry identifying the page to fetch
     * @return the page information produced by the import algorithm
     */
    @Override
    protected PageInfo fetchPage(URLCrawlEntry entry) {
        return importURL.fetchPage(entry);
    }

    /**
     * Returns whatever the IImportURL instance reports as the exceptions
     * thrown by its last fetchPage call.
     */
    public AggregateException getThrownExceptions() {
        return importURL.getThrownExceptions();
    }

    /**
     * Must be invoked when done in order to recover Browser and Shell
     * resources; forwards to the IImportURL instance.
     */
    public void dispose() {
        importURL.dispose();
    }
}