/******************************************************************************* * CogTool Copyright Notice and Distribution Terms * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * CogTool is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * CogTool is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with CogTool; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * CogTool makes use of several third-party components, with the * following notices: * * Eclipse SWT version 3.448 * Eclipse GEF Draw2D version 3.2.1 * * Unless otherwise indicated, all Content made available by the Eclipse * Foundation is provided to you under the terms and conditions of the Eclipse * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this * Content and is also available at http://www.eclipse.org/legal/epl-v10.html. * * CLISP version 2.38 * * Copyright (c) Sam Steingold, Bruno Haible 2001-2006 * This software is distributed under the terms of the FSF Gnu Public License. * See COPYRIGHT file in clisp installation folder for more information. * * ACT-R 6.0 * * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & * John R Anderson. * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). 
* * Apache Jakarta Commons-Lang 2.1 * * This product contains software developed by the Apache Software Foundation * (http://www.apache.org/) * * jopt-simple version 1.0 * * Copyright (c) 2004-2013 Paul R. Holser, Jr. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Mozilla XULRunner 1.9.0.5 * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/. * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the * License for the specific language governing rights and limitations * under the License. * * The J2SE(TM) Java Runtime Environment version 5.0 * * Copyright 2009 Sun Microsystems, Inc., 4150 * Network Circle, Santa Clara, California 95054, U.S.A. All * rights reserved. U.S. 
* See the LICENSE file in the jre folder for more information. ******************************************************************************/ package edu.cmu.cs.hcii.cogtool.controller; import java.io.ByteArrayOutputStream; import java.io.File; import org.eclipse.swt.SWT; import org.eclipse.swt.SWTException; import org.eclipse.swt.browser.Browser; import org.eclipse.swt.browser.ProgressAdapter; import org.eclipse.swt.browser.ProgressEvent; import org.eclipse.swt.graphics.GC; import org.eclipse.swt.graphics.Image; import org.eclipse.swt.graphics.ImageData; import org.eclipse.swt.graphics.ImageLoader; import org.eclipse.swt.graphics.Point; import org.eclipse.swt.graphics.Rectangle; import org.eclipse.swt.layout.FillLayout; import org.eclipse.swt.widgets.Shell; import org.mozilla.interfaces.nsIDOMDocument; import org.mozilla.interfaces.nsIDOMElement; import org.mozilla.interfaces.nsIDOMLocation; import org.mozilla.interfaces.nsIDOMNSHTMLElement; import org.mozilla.interfaces.nsIDOMNode; import org.mozilla.interfaces.nsIDOMNodeList; import org.mozilla.interfaces.nsIDOMWindow; import org.mozilla.interfaces.nsIDOMWindowInternal; import org.mozilla.interfaces.nsIWebBrowser; import edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry; import edu.cmu.cs.hcii.cogtool.model.URLPositionedLink; import edu.cmu.cs.hcii.cogtool.util.AggregateException; import edu.cmu.cs.hcii.cogtool.util.Cancelable; import edu.cmu.cs.hcii.cogtool.util.OSUtils; import edu.cmu.cs.hcii.cogtool.util.SynchronizedWait; import edu.cmu.cs.hcii.cogtool.util.WindowUtil; /** * Support for fetching and "crawling" a single URL using the Mozilla * XPCOM support. 
*
 * Documentation for installation:
 *   http://www.eclipse.org/swt/faq.php#howusejavaxpcom
 *
 * The following is needed to link into CogTool itself:
 *   http://releases.mozilla.org/pub/mozilla.org/xulrunner/releases/1.8.1.3/contrib/sdk/
 *   (the .zip file will contain the .jar that should be shipped and loaded)
 *
 * There is a LICENSE document in the following but not the preceding .zip.
 * I do not know what this means for us.
 *
 * The following must be installed on the user's machine (just by unzipping)
 *   http://releases.mozilla.org/pub/mozilla.org/xulrunner/releases/1.8.1.3/contrib/
 *   (the .zip file will contain an executable for xulrunner; either this should
 *    be "registered" (by executing it "xulrunner --register-user") or, perhaps,
 *    we might be able to get away with setProperty calls, something like:
 *       System.setProperty(XULRUNNER_PATH, <mozillaInstallPath>);
 *       System.setProperty(XULRUNNER_INITIALIZED, "true");  ).
 *   [see http://www.eclipse.org/atf/downloads/base_files/manualXulrunner_section.php]
 *   [see also http://archive.netbsd.se/?ml=mozilla-dev-embedding&a=2007-03&t=3267662]
 *   Or: System.setProperty("org.eclipse.swt.browser.XULRunnerPath", <stringPathToYourXULRunner1812install>);
 *   as per: http://markmail.org/message/3akz4o3hxtktcmt3
 *
 * NOTE: If pages are fetched sequentially, the same ImportWebURL object
 * may be used; simply invoke fetchPage for each.
 *
 * Preparation involves:
 *  - Creating a Shell window
 *  - Creating a Browser instance
 *  - Opening the window to "receive" the URL's data
 *
 * Fetch occurs in three "phases":
 * 1. Setting the URL -- this must occur in the main thread or the XPCOM
 *    stuff doesn't work (see the scheduleAsynchronously call in fetchPage).
 * 2. Once the URL load has completed, the DOM is walked to fetch the links,
 *    their extents, target URL, and associated "label" text.  (See the
 *    ProgressListener assigned to the Browser instance in the constructor.)
 * 3.
If images are requested, a separate request must be made to "grab"
 *    the Browser's image and encode it (the code currently saves it in
 *    JPEG format), also in the main thread (see the CaptureBrowserImage
 *    instance variable).
 *
 * Since each of these must occur in the main thread, the child thread must
 * wait for both the second and third (if requested) phases to complete
 * before returning the fetched data.
 */
public class ImportWebURL implements ImportWebCrawler.IImportURL
{
    // These are accessed only in the main thread
    protected Shell window;
    protected Browser browser;

    protected boolean importImages;     // shared, never changes
    protected Cancelable cancelState;   // internally synchronized
    protected ImportPageInfo urlPage;   // shared, changes TODO synchronize?

    // Accessed only within synchronized blocks
    protected AggregateException mainThreadExceptions =
        new AggregateException();

    /**
     * The state indicating whether the URL load/parse is complete.
     */
    protected SynchronizedWait parseComplete = new SynchronizedWait(false);

    /**
     * The state indicating whether the image capture is complete.
     * It is switched to "not done" only when a capture is actually
     * scheduled (see the ProgressListener's completed method).
     */
    protected SynchronizedWait captureComplete = new SynchronizedWait(true);

    /**
     * Runnable that copies the Browser's current contents into an image,
     * encodes it as JPEG, and stores the bytes and bounds in the
     * ImportPageInfo most recently given to resetLink.
     *
     * For some reason, this does not work if placed in the ProgressListener.
     * Also, at the moment, it captures what the Display sees, not what
     * the actual Shell contents are, which means that if a window is on top
     * of the Browser Shell, its appearance is part of the resulting image!!!!
     */
    protected class CaptureBrowserImage implements Runnable
    {
        // Destination for the captured image; set by resetLink
        protected ImportPageInfo pageInfo;

        // Origin added to the image bounds when reporting the background
        // position; set by resetLink
        protected int imgX;
        protected int imgY;

        // Requested extent; currently always overwritten in run() with
        // the browser's own size
        protected int imgWidth;
        protected int imgHeight;

        protected Browser urlBrowser;
        protected GC gc;
        protected ImageData[] loaderData = new ImageData[1];
        protected ImageLoader loader = new ImageLoader();

        /**
         * Creates the graphics context used for all subsequent captures.
         *
         * @param b the browser whose contents will be captured
         */
        public CaptureBrowserImage(Browser b)
        {
            urlBrowser = b;
            gc = new GC(urlBrowser);
            loader.data = loaderData;
        }

        /**
         * Releases the graphics context created by the constructor.
         */
        public void dispose()
        {
            gc.dispose();
        }

        /**
         * Records where the next capture should be stored.
         *
         * @param info the page information that receives the image
         * @param x horizontal origin added to the reported image bounds
         * @param y vertical origin added to the reported image bounds
         * @param w requested width (currently ignored by run())
         * @param h requested height (currently ignored by run())
         */
        public void resetLink(ImportPageInfo info, int x, int y, int w, int h)
        {
            pageInfo = info;
            imgX = x;
            imgY = y;
            imgWidth = w;
            imgHeight = h;
        }

        /*
         * Compare, for example:
         * http://dev.eclipse.org/mhonarc/lists/platform-swt-dev/msg04846.html
         */
        /**
         * Performs the capture; executes in the main thread.  Always marks
         * captureComplete as done, whether or not the capture succeeded.
         */
        public void run()
        {
            Image image = null;
            ByteArrayOutputStream out = null;

            try {
                // TODO: for now, always use the browser's width/height
                // rather than the extent handed to resetLink.  The size is
                // taken from the same control the GC was created on.
                Point extent = urlBrowser.getSize();

                imgWidth = extent.x;
                imgHeight = extent.y;

                image = new Image(null, imgWidth, imgHeight);
                gc.copyArea(image, 0, 0);

                loaderData[0] = image.getImageData();

                out = new ByteArrayOutputStream(32768);
                loader.save(out, SWT.IMAGE_JPEG);

                pageInfo.background = out.toByteArray();

                Rectangle bkgBounds = image.getBounds();

                pageInfo.bkgImageX = imgX + bkgBounds.x;
                pageInfo.bkgImageY = imgY + bkgBounds.y;
                pageInfo.bkgImageWidth = bkgBounds.width;
                pageInfo.bkgImageHeight = bkgBounds.height;
            }
            catch (SWTException ex) {
                // Deliberately not recorded as a failure: the page is still
                // returned, just without (complete) background data.
                // NOTE(review): the message assumes the only cause is a
                // disposed widget -- confirm.
                System.out.println("widget is disposed");
            }
            catch (Exception ex) {
                recordMainThreadException(ex);
            }
            finally {
                if (image != null) {
                    image.dispose();
                }

                if (out != null) {
                    try {
                        out.close();
                    }
                    catch (Exception ex) {
                        recordMainThreadException(ex);
                    }
                }

                // Release the child thread waiting in fetchPage
                captureComplete.changeState(true);
            }
        }
    }

    // Accessed only in the main thread
    protected CaptureBrowserImage captureImage;

    /**
     * Views the given node as a DOM element.
     *
     * @param node the node to convert
     * @return the element interface of the node, or null if the node is
     *         not an element node
     */
    protected nsIDOMElement asDOMElement(nsIDOMNode node)
    {
        if (node.getNodeType() != nsIDOMNode.ELEMENT_NODE) {
            return null;
        }

        return (nsIDOMElement)
                    node.queryInterface(nsIDOMElement.NS_IDOMELEMENT_IID);
    }

    /**
     * Views the given element through the Mozilla-specific HTML element
     * interface, which provides the offset (position/size) attributes.
     *
     * @param e the element to convert
     * @return the result of querying the element for nsIDOMNSHTMLElement
     */
    protected nsIDOMNSHTMLElement asHTMLElement(nsIDOMElement e)
    {
        return (nsIDOMNSHTMLElement)
            e.queryInterface(nsIDOMNSHTMLElement.NS_IDOMNSHTMLELEMENT_IID);
    }

    protected static final String XULRUNNER_PATH =
        "org.eclipse.swt.browser.XULRunnerPath";

    /**
     * Adds the given exception to mainThreadExceptions while holding
     * that object's lock.
     */
    private void recordMainThreadException(Exception ex)
    {
        synchronized (mainThreadExceptions) {
            mainThreadExceptions.addException(ex);
        }
    }

    /**
     * Throws mainThreadExceptions if any exception has been recorded.
     */
    private void throwIfMainThreadFailed()
    {
        boolean exceptionsThrown;

        synchronized (mainThreadExceptions) {
            exceptionsThrown = mainThreadExceptions.containsExceptions();
        }

        // TODO: Since this object is shared, we have no way of knowing
        // which URL caused the errors; thus, we'll just abort here.
        if (exceptionsThrown) {
            throw mainThreadExceptions;
        }
    }

    /**
     * Constructor allows caller to specify whether background images are
     * desired and the width/height for the browser window.
     * Executed in the main thread.
     *
     * @param importImg whether to capture images
     * @param browserWidth width of browser window
     * @param browserHeight height of browser window
     * @param cancelable may be null; if not, controls whether the
     *                   process should be canceled.
     */
    public ImportWebURL(boolean importImg,
                        int browserWidth,
                        int browserHeight,
                        Cancelable cancelable)
    {
        // Point SWT at the XULRunner directory for this platform; the
        // relative name is resolved against the current working directory.
        String xulRunnerDir = OSUtils.MACOSX ? "XUL-mac" : "XUL-win";

        System.setProperty(XULRUNNER_PATH,
                           (new File(xulRunnerDir)).getAbsolutePath());

        importImages = importImg;
        cancelState = cancelable;

        window = new Shell(WindowUtil.GLOBAL_DISPLAY, SWT.NONE);
        window.setLocation(0, 120);
        window.setLayout(new FillLayout());
        window.setSize(browserWidth, browserHeight);

        browser = new Browser(window, SWT.MOZILLA | SWT.DOUBLE_BUFFERED);
        browser.setSize(browserWidth, browserHeight);

        captureImage = new CaptureBrowserImage(browser);

        // See: http://groups.google.com/group/mozilla.dev.embedding/browse_thread/thread/318404f35e13d46e
        browser.addProgressListener(new ProgressAdapter() {
            /**
             * Grows the link's rectangle so that it also covers the
             * given rectangle.
             */
            protected void union(URLPositionedLink info,
                                 double otherLeft,
                                 double otherTop,
                                 double otherWidth,
                                 double otherHeight)
            {
                // Compute the far edges before left/top are modified
                double infoRight = info.left + info.width;
                double infoBottom = info.top + info.height;
                double otherRight = otherLeft + otherWidth;
                double otherBottom = otherTop + otherHeight;

                info.left = Math.min(info.left, otherLeft);
                info.top = Math.min(info.top, otherTop);
                info.width = Math.max(infoRight, otherRight) - info.left;
                info.height = Math.max(infoBottom, otherBottom) - info.top;
            }

            /**
             * Accumulates into linkInfo the extent of the given node and,
             * recursively, of its children.
             *
             * @param node the DOM node to measure
             * @param linkInfo receives the (union of the) extents found
             * @return the node as an element, or null if the node is not
             *         an element node
             */
            protected nsIDOMElement getNodeExtent(nsIDOMNode node,
                                                  URLPositionedLink linkInfo)
            {
                nsIDOMElement elt = asDOMElement(node);

                if (elt == null) {
                    return null;
                }

                nsIDOMNSHTMLElement htmlElt = asHTMLElement(elt);

                if (htmlElt != null) {
                    int left = htmlElt.getOffsetLeft();
                    int top = htmlElt.getOffsetTop();
                    int width = htmlElt.getOffsetWidth();
                    int height = htmlElt.getOffsetHeight();

                    // Offsets are relative to the offset parent; walk up
                    // the chain, summing them, to position the element.
                    nsIDOMElement offsetParent = htmlElt.getOffsetParent();

                    while (offsetParent != null) {
                        nsIDOMNSHTMLElement offsetHTMLParent =
                            asHTMLElement(offsetParent);

                        left += offsetHTMLParent.getOffsetLeft();
                        top += offsetHTMLParent.getOffsetTop();

                        offsetParent = offsetHTMLParent.getOffsetParent();
                    }

                    // Double.MAX_VALUE is treated as "no extent recorded yet"
                    if (linkInfo.left == Double.MAX_VALUE) {
                        linkInfo.left = left;
                        linkInfo.top = top;
                        linkInfo.width = width;
                        linkInfo.height = height;
                    }
                    else {
                        union(linkInfo, left, top, width, height);
                    }

                    // TODO should this be outside the if (elt != null) ?
                    nsIDOMNodeList children = node.getChildNodes();

                    if (children != null) {
                        long childCount = children.getLength();

                        for (long i = 0; i < childCount; i++) {
                            getNodeExtent(children.item(i), linkInfo);
                        }
                    }
                }

                return elt;
            }

            /**
             * Returns the "title" attribute of the given (image) element
             * if it is non-empty; otherwise its "alt" attribute.
             */
            protected String getImageText(nsIDOMElement elt)
            {
                String title = elt.getAttribute("title");

                if ((title != null) && ! title.equals("")) {
                    return title;
                }

                return elt.getAttribute("alt");
            }

            /**
             * Computes label text for the given node: the value of a text
             * node, the title/alt text of an IMG element, or otherwise the
             * concatenation of the text of all child nodes.
             */
            protected String getNodeText(nsIDOMNode node)
            {
                if (node.getNodeType() == nsIDOMNode.TEXT_NODE) {
                    return node.getNodeValue();
                }

                nsIDOMElement elt = asDOMElement(node);

                if ((elt != null) &&
                    "IMG".equalsIgnoreCase(elt.getTagName()))
                {
                    return getImageText(elt);
                }

                nsIDOMNodeList children = node.getChildNodes();

                if (children == null) {
                    return "";
                }

                StringBuilder childrenText = new StringBuilder();
                long childCount = children.getLength();

                for (long i = 0; i < childCount; i++) {
                    String childText = getNodeText(children.item(i));

                    if (childText != null) {
                        childrenText.append(childText);
                    }
                }

                return childrenText.toString();
            }

            /*
             * Online documentation for these calls:
             * http://developer.mozilla.org/en/docs/Interfaces
             * http://www.xulplanet.com/references/xpcomref/group_MozillaSpecificDOM.html
             * http://www.xulplanet.com/references/xpcomref/group_W3CDOM.html
             * http://www.xulplanet.com/references/xpcomref/group_DOMHTML.html
             * Attributes are fetched using "bean"-like getters
             * (e.g., technically, "document" is an attribute of nsIDOMWindow,
             * so one accesses by calling "getDocument()").
             */
            /**
             * Walks the loaded document's anchors, storing each non-empty
             * link in urlPage, records the final URL and, if images were
             * requested, schedules the image capture.  Always marks
             * parseComplete as done on exit.
             */
            @Override
            public void completed(ProgressEvent event)
            {
                // It appears that this gets invoked whenever anything
                // related to the current page completes its loading;
                // only the first notification per fetch is processed.
                if (parseComplete.isTaskDone()) {
                    return;
                }

                // Executes in the main thread, I believe.
                try {
                    nsIWebBrowser domBrowser =
                        (nsIWebBrowser) browser.getWebBrowser();
                    nsIDOMWindow domWindow = domBrowser.getContentDOMWindow();
                    nsIDOMDocument domDocument = domWindow.getDocument();
                    nsIDOMNodeList domLinks =
                        domDocument.getElementsByTagName("a");
                    long numLinks = domLinks.getLength();

                    for (long i = 0; i < numLinks; i++) {
                        URLPositionedLink linkInfo = new URLPositionedLink();
                        nsIDOMNode linkNode = domLinks.item(i);

                        linkInfo.setLabel(getNodeText(linkNode));

                        nsIDOMElement linkElt =
                            getNodeExtent(linkNode, linkInfo);

                        if (linkElt != null) {
                            linkInfo.setURL(linkElt.getAttribute("href"));

                            if (! linkInfo.isEmpty()) {
                                urlPage.links.add(linkInfo);
                            }
                        }
                    }

                    nsIDOMWindowInternal domWinI =
                        (nsIDOMWindowInternal)
                            domWindow.queryInterface(nsIDOMWindowInternal.NS_IDOMWINDOWINTERNAL_IID);

                    // Use the location actually loaded rather than the
                    // URL that was requested.
                    nsIDOMLocation domLoc = domWinI.getLocation();

                    urlPage.url = domLoc.getHref();

                    if (importImages) {
                        // domWinI has already been dereferenced above, so
                        // no separate null check is needed here.
                        int docWidth =
                            domWinI.getInnerWidth() + domWinI.getScrollMaxX();
                        int docHeight =
                            domWinI.getInnerHeight() + domWinI.getScrollMaxY();

                        captureImage.resetLink(urlPage,
                                               domWindow.getScrollX(),
                                               domWindow.getScrollY(),
                                               docWidth,
                                               docHeight);

                        // Must be marked pending before parseComplete is
                        // released in the finally clause below, so that
                        // fetchPage then waits for the capture.
                        captureComplete.changeState(false);
                        WindowUtil.scheduleAsynchronously(captureImage);
                    }
                }
                catch (Exception ex) {
                    recordMainThreadException(ex);
                }
                finally {
                    parseComplete.changeState(true);
                }
            }
        });

        window.open();
    }

    /**
     * Use the instance's Browser to fetch the given URL and glean contained
     * link and image information from it.  It is assumed that this method
     * will be invoked in a child thread.
     *
     * Throws the shared AggregateException if any exception has been
     * recorded in the main thread (recorded exceptions are never cleared).
     *
     * @param entry the URL to crawl
     * @return the page information, or null if waiting for the load or the
     *         image capture reported that it did not complete
     */
    public ImportPageInfo fetchPage(URLCrawlEntry entry)
    {
        // Must set this before the setURL on the browser since we are
        // in a child thread.
        parseComplete.changeState(false);

        final String entryURL = entry.getURL();

        urlPage = new ImportPageInfo(entryURL);

        // The setUrl call must be invoked in the main SWT thread.
        WindowUtil.scheduleAsynchronously(new Runnable() {
            public void run()
            {
                try {
                    if (! browser.setUrl(entryURL)) {
                        recordMainThreadException(new IllegalStateException("setUrl returned false for: " + entryURL));
                        parseComplete.changeState(true);
                    }
                }
                catch (Exception ex) {
                    // Presumably, an exception here means that the setUrl
                    // failed and won't attempt to do any work
                    recordMainThreadException(ex);
                    parseComplete.changeState(true);
                }
            }
        });

        // Wait until the URL has been loaded; the wait also ends if the
        // setUrl call failed or threw.
        if (! parseComplete.waitUntilDone(cancelState)) {
            return null;
        }

        // Either setUrl or the page parse may have failed.
        throwIfMainThreadFailed();

        // Wait for the image capture to finish, if requested.
        if (importImages && ! captureComplete.waitUntilDone(cancelState)) {
            return null;
        }

        // We have to check again in case the image capture
        // (if requested) threw any exceptions.
        throwIfMainThreadFailed();

        return urlPage;
    } // fetchPage

    /**
     * Returns the aggregate of all exceptions recorded in the main thread.
     */
    public AggregateException getThrownExceptions()
    {
        return mainThreadExceptions;
    }

    /**
     * Releases the capture helper, the Browser and its Shell window.
     */
    public void dispose()
    {
        captureImage.dispose();
        browser.dispose();
        window.dispose();
    }
}