ImportWebURL.java example

Explorer
cogtool-master
/*******************************************************************************
 * CogTool Copyright Notice and Distribution Terms
 * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University
 * This software is distributed under the terms of the FSF Lesser
 * Gnu Public License (see LGPL.txt). 
 * 
 * CogTool is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * CogTool is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with CogTool; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * CogTool makes use of several third-party components, with the 
 * following notices:
 * 
 * Eclipse SWT version 3.448
 * Eclipse GEF Draw2D version 3.2.1
 * 
 * Unless otherwise indicated, all Content made available by the Eclipse 
 * Foundation is provided to you under the terms and conditions of the Eclipse 
 * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this 
 * Content and is also available at http://www.eclipse.org/legal/epl-v10.html.
 * 
 * CLISP version 2.38
 * 
 * Copyright (c) Sam Steingold, Bruno Haible 2001-2006
 * This software is distributed under the terms of the FSF Gnu Public License.
 * See COPYRIGHT file in clisp installation folder for more information.
 * 
 * ACT-R 6.0
 * 
 * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & 
 *                         John R Anderson. 
 * This software is distributed under the terms of the FSF Lesser
 * Gnu Public License (see LGPL.txt).
 * 
 * Apache Jakarta Commons-Lang 2.1
 * 
 * This product contains software developed by the Apache Software Foundation
 * (http://www.apache.org/)
 * 
 * jopt-simple version 1.0
 * 
 * Copyright (c) 2004-2013 Paul R. Holser, Jr.
 * 
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 * Mozilla XULRunner 1.9.0.5
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/.
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * The J2SE(TM) Java Runtime Environment version 5.0
 * 
 * Copyright 2009 Sun Microsystems, Inc., 4150
 * Network Circle, Santa Clara, California 95054, U.S.A.  All
 * rights reserved. U.S.  
 * See the LICENSE file in the jre folder for more information.
 ******************************************************************************/

package edu.cmu.cs.hcii.cogtool.controller;

import java.io.ByteArrayOutputStream;
import java.io.File;

import org.eclipse.swt.SWT;
import org.eclipse.swt.SWTException;
import org.eclipse.swt.browser.Browser;
import org.eclipse.swt.browser.ProgressAdapter;
import org.eclipse.swt.browser.ProgressEvent;
import org.eclipse.swt.graphics.GC;
import org.eclipse.swt.graphics.Image;
import org.eclipse.swt.graphics.ImageData;
import org.eclipse.swt.graphics.ImageLoader;
import org.eclipse.swt.graphics.Point;
import org.eclipse.swt.graphics.Rectangle;
import org.eclipse.swt.layout.FillLayout;
import org.eclipse.swt.widgets.Shell;
import org.mozilla.interfaces.nsIDOMDocument;
import org.mozilla.interfaces.nsIDOMElement;
import org.mozilla.interfaces.nsIDOMLocation;
import org.mozilla.interfaces.nsIDOMNSHTMLElement;
import org.mozilla.interfaces.nsIDOMNode;
import org.mozilla.interfaces.nsIDOMNodeList;
import org.mozilla.interfaces.nsIDOMWindow;
import org.mozilla.interfaces.nsIDOMWindowInternal;
import org.mozilla.interfaces.nsIWebBrowser;

import edu.cmu.cs.hcii.cogtool.model.URLCrawlEntry;
import edu.cmu.cs.hcii.cogtool.model.URLPositionedLink;
import edu.cmu.cs.hcii.cogtool.util.AggregateException;
import edu.cmu.cs.hcii.cogtool.util.Cancelable;
import edu.cmu.cs.hcii.cogtool.util.OSUtils;
import edu.cmu.cs.hcii.cogtool.util.SynchronizedWait;
import edu.cmu.cs.hcii.cogtool.util.WindowUtil;

/**
 * Support for fetching and "crawling" a single URL using the Mozilla
 * XPCOM support.
 *
 * Documentation for installation:
 * http://www.eclipse.org/swt/faq.php#howusejavaxpcom
 *
 * The following is needed to link into CogTool itself:
 * http://releases.mozilla.org/pub/mozilla.org/xulrunner/releases/1.8.1.3/contrib/sdk/
 * (the .zip file will contain the .jar that should be shipped and loaded)
 *
 * There is a LICENSE document in the following but not the preceding .zip.
 * I do not know what this means for us.
 *
 * The following must be installed on the user's machine (just by unzipping)
 * http://releases.mozilla.org/pub/mozilla.org/xulrunner/releases/1.8.1.3/contrib/
 * (the .zip file will contain an executable for xulrunner; either this should
 * be "registered" (by executing "xulrunner --register-user") or, perhaps,
 * we might be able to get away with setProperty calls, something like:
 * System.setProperty(XULRUNNER_PATH, <mozillaInstallPath>);
 * System.setProperty(XULRUNNER_INITIALIZED, "true"); ).
 * [see http://www.eclipse.org/atf/downloads/base_files/manualXulrunner_section.php]
 * [see also http://archive.netbsd.se/?ml=mozilla-dev-embedding&a=2007-03&t=3267662]
 * Or: System.setProperty("org.eclipse.swt.browser.XULRunnerPath", <stringPathToYourXULRunner1812install>);
 * as per: http://markmail.org/message/3akz4o3hxtktcmt3
 *
 * NOTE: If pages are fetched sequentially, the same ImportWebURL object
 * may be used; simply invoke fetchPage for each.
 *
 * Preparation involves:
 *  - Creating a Shell window
 *  - Creating a Browser instance
 *  - Opening the window to "receive" the URL's data
 *
 * Fetch occurs in three "phases":
 *  1. Setting the URL -- this must occur in the main thread or the XPCOM
 *     stuff doesn't work (see the scheduleAsynchronously call in fetchPage).
 *  2. Once the URL load has completed, the DOM is walked to fetch the links,
 *     their extents, target URL, and associated "label" text.   (See the
 *     ProgressListener assigned to the Browser instance in the constructor.)
 *  3. If images are requested, a separate request must be made to "grab"
 *     the Browser's image and convert it to JPEG format, also in the main
 *     thread (see the CaptureBrowserImage class and the captureImage
 *     instance variable).
 *
 * Since each of these must occur in the main thread, the child thread must
 * wait for both the second and third (if requested) phases to complete
 * before returning the fetched data.
 */
public class ImportWebURL implements ImportWebCrawler.IImportURL
{
    // The Shell created by the constructor and the Browser it contains.
    // These are accessed only in the main thread
    protected Shell window;
    protected Browser browser;

    // Whether a background image is captured for each fetched page;
    // assigned once, in the constructor.
    protected boolean importImages;    // shared, never changes
    // Passed to the waitUntilDone calls in fetchPage; may be null
    // (see the constructor's documentation).
    protected Cancelable cancelState; // internally synchronized

    // The page currently being fetched: replaced by fetchPage for each
    // request, then filled in by the Browser's progress listener and by
    // the image capture.
    protected ImportPageInfo urlPage;  // shared, changes TODO synchronize?

    // Collects exceptions raised by work executed in the main thread.
    // Additions and containsExceptions() checks are made only within blocks
    // synchronized on this object; note, however, that fetchPage throws it
    // and getThrownExceptions returns it outside of any such block.
    // Nothing in this class ever clears it.
    protected AggregateException mainThreadExceptions =
        new AggregateException();

    /**
     * The state indicating whether the URL load/parse is complete.
     * Reset to "not done" at the start of fetchPage; marked done by the
     * progress listener's completed() callback or when setUrl fails.
     */
    protected SynchronizedWait parseComplete = new SynchronizedWait(false);

    /**
     * The state indicating whether the image capture is complete.
     * Starts out done; marked "not done" by the progress listener just
     * before it schedules an image capture, and done again when
     * CaptureBrowserImage.run() finishes.
     */
    protected SynchronizedWait captureComplete = new SynchronizedWait(true);

    /**
     * Runnable that copies the current appearance of a Browser widget into
     * an image, encodes it as JPEG, and stores the bytes together with the
     * image's position and size in an ImportPageInfo.  One instance is
     * reused for every page: call resetLink to set the target and then
     * schedule the instance in the main thread.  run() always marks
     * captureComplete as done when it finishes, whether or not the
     * capture succeeded.
     *
     * For some reason, this does not work if placed in the ProgressListener.
     * Also, at the moment, it captures what the Display sees, not what
     * the actual Shell contents are, which means that if a window is on top
     * of the Browser Shell, its appearance is part of the resulting image!!!!
     */
    protected class CaptureBrowserImage implements Runnable
    {
        protected ImportPageInfo pageInfo;
        protected int imgX;
        protected int imgY;
        protected int imgWidth;
        protected int imgHeight;
        protected Browser urlBrowser;
        protected GC gc;
        protected ImageData[] loaderData = new ImageData[1];
        protected ImageLoader loader = new ImageLoader();

        /**
         * Allocates the graphics context used to copy the browser's pixels.
         * The caller must eventually invoke dispose() to release it.
         *
         * @param b the browser whose contents will be captured
         */
        public CaptureBrowserImage(Browser b)
        {
            urlBrowser = b;
            gc = new GC(urlBrowser);
            loader.data = loaderData;
        }

        /**
         * Releases the graphics context allocated by the constructor.
         */
        public void dispose()
        {
            gc.dispose();
        }

        /**
         * Sets the page that the next run() will fill in.
         *
         * @param info the page information to receive the image
         * @param x offset added to the image's x position
         * @param y offset added to the image's y position
         * @param w requested width; currently overwritten by run()
         * @param h requested height; currently overwritten by run()
         */
        public void resetLink(ImportPageInfo info, int x, int y, int w, int h)
        {
            pageInfo = info;
            imgX = x;
            imgY = y;
            imgWidth = w;
            imgHeight = h;
        }

        /*
         * Compare, for example:
         * http://dev.eclipse.org/mhonarc/lists/platform-swt-dev/msg04846.html
         */

        /**
         * Performs the capture.  Executes in the main thread.
         * An SWTException is treated as "no image available": it is
         * reported on System.err and the page keeps whatever background it
         * already had.  Any other exception is recorded in
         * mainThreadExceptions.
         */
        public void run()
        {
            Image image = null;

            try {
                // TODO: for now, always use the browser's width/height
                // instead of the extent supplied to resetLink.
                Point extent = urlBrowser.getSize();

                imgWidth = extent.x;
                imgHeight = extent.y;

                image = new Image(null, imgWidth, imgHeight);

                gc.copyArea(image, 0, 0);
                loaderData[0] = image.getImageData();

                // Closing a ByteArrayOutputStream has no effect, so the
                // stream needs no cleanup.
                ByteArrayOutputStream out = new ByteArrayOutputStream(32768);

                loader.save(out, SWT.IMAGE_JPEG);

                pageInfo.background = out.toByteArray();

                Rectangle bkgBounds = image.getBounds();

                pageInfo.bkgImageX = imgX + bkgBounds.x;
                pageInfo.bkgImageY = imgY + bkgBounds.y;
                pageInfo.bkgImageWidth = bkgBounds.width;
                pageInfo.bkgImageHeight = bkgBounds.height;
            }
            catch (SWTException ex) {
                // Deliberately best-effort (e.g., the widget was disposed
                // while the capture was pending); report the real cause
                // rather than assuming which SWT failure occurred.
                System.err.println("Browser image capture skipped: "
                                       + ex.getMessage());
            }
            catch (Exception ex) {
                synchronized (mainThreadExceptions) {
                    mainThreadExceptions.addException(ex);
                }
            }
            finally {
                if (image != null) {
                    image.dispose();
                }

                // Always release the thread waiting in fetchPage.
                captureComplete.changeState(true);
            }
        }
    }

    // Reusable image-capture task, created by the constructor for this
    // instance's Browser and rescheduled by the progress listener for each
    // page when images are requested; released in dispose().
    // Accessed only in the main thread
    protected CaptureBrowserImage captureImage;

    /**
     * Narrows a DOM node to its nsIDOMElement interface.
     *
     * @param node the DOM node to examine
     * @return the node viewed as an element, or null if the node's type
     *         is not ELEMENT_NODE
     */
    protected nsIDOMElement asDOMElement(nsIDOMNode node)
    {
        boolean isElement =
            (node.getNodeType() == nsIDOMNode.ELEMENT_NODE);

        if (! isElement) {
            return null;
        }

        Object elementView =
            node.queryInterface(nsIDOMElement.NS_IDOMELEMENT_IID);

        return (nsIDOMElement) elementView;
    }

    /**
     * Obtains the nsIDOMNSHTMLElement interface of the given element by
     * way of queryInterface.
     *
     * @param e the element to convert
     * @return the result of the interface query, cast to nsIDOMNSHTMLElement
     */
    protected nsIDOMNSHTMLElement asHTMLElement(nsIDOMElement e)
    {
        Object htmlView =
            e.queryInterface(nsIDOMNSHTMLElement.NS_IDOMNSHTMLELEMENT_IID);

        return (nsIDOMNSHTMLElement) htmlView;
    }

    /**
     * Name of the system property that the constructor sets to the absolute
     * path of the XULRunner directory ("XUL-mac" or "XUL-win").
     */
    protected static final String XULRUNNER_PATH =
        "org.eclipse.swt.browser.XULRunnerPath";

    /**
     * Constructor allows caller to specify whether background images are
     * desired and the width/height for the browser window.
     * Executed in the main thread.
     *
     * Creates and opens the Shell, the Browser inside it, the reusable
     * image-capture task, and the progress listener that parses each
     * loaded page.  If any step after the Shell's creation fails, the
     * resources allocated so far are disposed before the failure
     * propagates, since the caller never receives an object on which to
     * invoke dispose().
     *
     * @param importImg whether to capture images
     * @param browserWidth width of browser window
     * @param browserHeight height of browser window
     * @param cancelable may be null; if not, controls whether the
     *                   process should be canceled.
     */
    public ImportWebURL(boolean importImg,
                        int browserWidth,
                        int browserHeight,
                        Cancelable cancelable)
    {
        // The directory name is relative, so it is resolved against the
        // current working directory.  Every platform other than Mac OS X
        // uses the "XUL-win" directory.
        String xulDirName = OSUtils.MACOSX ? "XUL-mac" : "XUL-win";

        System.setProperty(XULRUNNER_PATH,
                           (new File(xulDirName)).getAbsolutePath());

        importImages = importImg;
        cancelState = cancelable;

        window = new Shell(WindowUtil.GLOBAL_DISPLAY, SWT.NONE);

        boolean constructed = false;

        try {
            window.setLocation(0, 120);
            window.setLayout(new FillLayout());
            window.setSize(browserWidth, browserHeight);

            browser =
                new Browser(window, SWT.MOZILLA | SWT.DOUBLE_BUFFERED);
            browser.setSize(browserWidth, browserHeight);

            captureImage = new CaptureBrowserImage(browser);

            // See: http://groups.google.com/group/mozilla.dev.embedding/browse_thread/thread/318404f35e13d46e
            browser.addProgressListener(createPageLoadListener());

            window.open();

            constructed = true;
        }
        finally {
            if (! constructed) {
                // Browser creation (e.g., XULRunner is unavailable) or a
                // later step failed; release what was allocated.  Disposing
                // the Shell also disposes the Browser it contains.
                if (captureImage != null) {
                    captureImage.dispose();
                }

                window.dispose();
            }
        }
    }

    /**
     * Builds the listener that, once a page load completes, walks the DOM
     * to record every link's label, target URL and bounding box in urlPage,
     * records the page's final URL, and (if images were requested)
     * schedules the image capture.
     *
     * @return the progress listener to attach to the Browser
     */
    private ProgressAdapter createPageLoadListener()
    {
        return new ProgressAdapter()
        {
            /**
             * Grows info's rectangle so that it also covers the given
             * rectangle.
             */
            protected void union(URLPositionedLink info,
                                 double otherLeft,
                                 double otherTop,
                                 double otherWidth,
                                 double otherHeight)
            {
                double infoRight = info.left + info.width;
                double infoBottom = info.top + info.height;
                double otherRight = otherLeft + otherWidth;
                double otherBottom = otherTop + otherHeight;

                info.left = Math.min(info.left, otherLeft);
                info.top = Math.min(info.top, otherTop);

                info.width = Math.max(infoRight, otherRight) - info.left;
                info.height = Math.max(infoBottom, otherBottom) - info.top;
            }

            /**
             * Accumulates into linkInfo the bounding box of the given node
             * and, recursively, of its child nodes.  Each element's
             * position is found by summing the offsetLeft/offsetTop values
             * along its offsetParent chain.
             *
             * Returns non-null if the node has an extent, i.e., the node
             * itself as an element, or null if it is not an element node.
             */
            protected nsIDOMElement getNodeExtent(nsIDOMNode node,
                                                  URLPositionedLink linkInfo)
            {
                nsIDOMElement elt = asDOMElement(node);

                if (elt != null) {
                    nsIDOMNSHTMLElement htmlElt = asHTMLElement(elt);

                    if (htmlElt != null) {
                        int left = htmlElt.getOffsetLeft();
                        int top = htmlElt.getOffsetTop();
                        int width = htmlElt.getOffsetWidth();
                        int height = htmlElt.getOffsetHeight();

                        nsIDOMElement offsetParent = htmlElt.getOffsetParent();

                        while (offsetParent != null) {
                            nsIDOMNSHTMLElement offsetHTMLParent =
                                asHTMLElement(offsetParent);

                            left += offsetHTMLParent.getOffsetLeft();
                            top += offsetHTMLParent.getOffsetTop();
                            offsetParent = offsetHTMLParent.getOffsetParent();
                        }

                        // Double.MAX_VALUE is presumably the "no extent
                        // yet" value of a new URLPositionedLink -- TODO
                        // confirm against that class.
                        if (linkInfo.left == Double.MAX_VALUE) {
                            linkInfo.left = left;
                            linkInfo.top = top;
                            linkInfo.width = width;
                            linkInfo.height = height;
                        }
                        else {
                            union(linkInfo,
                                  left,
                                  top,
                                  width,
                                  height);
                        }

// TODO should this be outside the if (elt != null) ?
                        nsIDOMNodeList children = node.getChildNodes();

                        if (children != null) {
                            long childCount = children.getLength();

                            for (long i = 0; i < childCount; i++) {
                                getNodeExtent(children.item(i), linkInfo);
                            }
                        }
                    }
                }

                return elt;
            }

            /**
             * Returns the element's "title" attribute if it is non-empty;
             * otherwise its "alt" attribute.
             */
            protected String getImageText(nsIDOMElement elt)
            {
                String title = elt.getAttribute("title");

                if ((title != null) && ! title.equals("")) {
                    return title;
                }

                return elt.getAttribute("alt");
            }

            /**
             * Returns the label text for a node: the value of a text node,
             * the title/alt text of an IMG element, or otherwise the
             * concatenation of the text of all child nodes.
             */
            protected String getNodeText(nsIDOMNode node)
            {
                if (node.getNodeType() == nsIDOMNode.TEXT_NODE) {
                    return node.getNodeValue();
                }

                nsIDOMElement elt = asDOMElement(node);

                if (elt != null) {
                    if ("IMG".equalsIgnoreCase(elt.getTagName())) {
                        return getImageText(elt);
                    }
                }

                StringBuilder childrenText = new StringBuilder();
                nsIDOMNodeList children = node.getChildNodes();

                if (children == null) {
                    return "";
                }

                long childCount = children.getLength();

                for (long i = 0; i < childCount; i++) {
                    String childText = getNodeText(children.item(i));

                    if (childText != null) {
                        childrenText.append(childText);
                    }
                }

                return childrenText.toString();
            }

            /*
             * Online documentation for these calls:
             * http://developer.mozilla.org/en/docs/Interfaces
             * http://www.xulplanet.com/references/xpcomref/group_MozillaSpecificDOM.html
             * http://www.xulplanet.com/references/xpcomref/group_W3CDOM.html
             * http://www.xulplanet.com/references/xpcomref/group_DOMHTML.html
             * Attributes are fetched using "bean"-like getters
             * (e.g., technically, "document" is an attribute of nsIDOMWindow,
             * so one accesses by calling "getDocument()").
             */
            @Override
            public void completed(ProgressEvent event)
            {
                // This appears to be invoked whenever anything related to
                // the current page completes its loading, so only the
                // first notification for each fetch is processed.
                if (parseComplete.isTaskDone()) {
                    return;
                }

                // Executes in the main thread, I believe.
                try {
                    nsIWebBrowser domBrowser =
                        (nsIWebBrowser) browser.getWebBrowser();
                    nsIDOMWindow domWindow =
                        domBrowser.getContentDOMWindow();
                    nsIDOMDocument domDocument = domWindow.getDocument();
                    nsIDOMNodeList domLinks =
                        domDocument.getElementsByTagName("a");

                    long numLinks = domLinks.getLength();

                    for (long i = 0; i < numLinks; i++) {
                        URLPositionedLink linkInfo = new URLPositionedLink();

                        nsIDOMNode linkNode = domLinks.item(i);

                        linkInfo.setLabel(getNodeText(linkNode));

                        nsIDOMElement linkElt =
                            getNodeExtent(linkNode, linkInfo);

                        if (linkElt != null) {
                            linkInfo.setURL(linkElt.getAttribute("href"));

                            if (! linkInfo.isEmpty()) {
                                urlPage.links.add(linkInfo);
                            }
                        }
                    }

                    nsIDOMWindowInternal domWinI =
                        (nsIDOMWindowInternal)
                            domWindow.queryInterface(nsIDOMWindowInternal.NS_IDOMWINDOWINTERNAL_IID);
                    nsIDOMLocation domLoc = domWinI.getLocation();

                    // Record the URL actually displayed, which replaces
                    // the one supplied when urlPage was created.
                    urlPage.url = domLoc.getHref();

                    if (importImages) {
                        // domWinI was dereferenced above, so it cannot be
                        // null at this point.
                        int docWidth =
                            domWinI.getInnerWidth() + domWinI.getScrollMaxX();
                        int docHeight =
                            domWinI.getInnerHeight() + domWinI.getScrollMaxY();

                        captureImage.resetLink(urlPage,
                                               domWindow.getScrollX(),
                                               domWindow.getScrollY(),
                                               docWidth,
                                               docHeight);

                        // Must be marked "not done" before parseComplete
                        // is released in the finally clause below so that
                        // fetchPage waits for the capture.
                        captureComplete.changeState(false);

                        WindowUtil.scheduleAsynchronously(captureImage);
                    }
                }
                catch (Exception ex) {
                    synchronized (mainThreadExceptions) {
                        mainThreadExceptions.addException(ex);
                    }
                }
                finally {
                    parseComplete.changeState(true);
                }
            }
        };
    }

    /**
     * Loads the given URL in this instance's Browser and gathers the link
     * (and, if requested, image) information found on the resulting page.
     * It is assumed that this method will be invoked in a child thread:
     * the load itself is scheduled in the main SWT thread, and this method
     * waits until the page has been parsed and any requested image
     * capture has finished.
     *
     * @param entry the URL to crawl
     * @return the page information, or null if one of the waits reports
     *         that it did not complete (presumably because the operation
     *         was canceled through the Cancelable given to the constructor)
     * @throws AggregateException the shared collection of main-thread
     *         exceptions, if it contains any exception after either wait
     */
    public ImportPageInfo fetchPage(URLCrawlEntry entry)
    {
        parseComplete.changeState(false);

        // The page object must exist before the load is requested, since
        // the progress listener fills it in from the main thread.
        final String entryURL = entry.getURL();

        urlPage = new ImportPageInfo(entryURL);

        // The setUrl call must be invoked in the main SWT thread.
        Runnable loadRequest = new Runnable() {

            public void run()
            {
                try {
                    boolean accepted = browser.setUrl(entryURL);

                    if (! accepted) {
                        synchronized (mainThreadExceptions) {
                            mainThreadExceptions.addException(new IllegalStateException("setUrl returned false for: " + entryURL));
                        }
                        parseComplete.changeState(true);
                    }
                }
                catch (Exception ex) {
                    // Presumably, a failure here means that no load was
                    // started, so no completion event will release the
                    // waiting thread; release it here instead.
                    synchronized (mainThreadExceptions) {
                        mainThreadExceptions.addException(ex);
                    }

                    parseComplete.changeState(true);
                }
            }
        };

        WindowUtil.scheduleAsynchronously(loadRequest);

        // Phase one: wait for the load and parse (or a setUrl failure).
        if (! parseComplete.waitUntilDone(cancelState)) {
            return null;
        }

        boolean failed;

        synchronized (mainThreadExceptions) {
            failed = mainThreadExceptions.containsExceptions();
        }

        // TODO: Since this object is shared, we have no way of knowing
        // which URL caused the errors; thus, we'll just abort here.
        if (failed) {
            throw mainThreadExceptions;
        }

        // Phase two: wait for the image capture, if one was requested.
        if (importImages && ! captureComplete.waitUntilDone(cancelState)) {
            return null;
        }

        // Check again in case the image capture recorded any exceptions.
        synchronized (mainThreadExceptions) {
            failed = mainThreadExceptions.containsExceptions();
        }

        // TODO: See above.
        if (failed) {
            throw mainThreadExceptions;
        }

        return urlPage;
    } // fetchPage


    /**
     * Returns the collection of exceptions recorded by work executed in the
     * main thread.  The shared object itself is returned, not a copy, and
     * no synchronization is performed here; elsewhere in this class it is
     * modified and queried within blocks synchronized on the object.
     *
     * @return the shared AggregateException instance
     */
    public AggregateException getThrownExceptions()
    {
        return mainThreadExceptions;
    }


    /**
     * Releases the resources created by the constructor: first the image
     * capture's graphics context, then the Browser, then the Shell.
     * NOTE(review): the window and browser fields are documented above as
     * accessed only in the main thread, so this presumably must be invoked
     * there as well -- confirm against callers.
     */
    public void dispose()
    {
        // The capture's GC was created on the browser, so release it first.
        captureImage.dispose();
        browser.dispose();
        window.dispose();
    }
}