/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.components.crawler;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.cocoon.Constants;
import org.apache.commons.lang.StringUtils;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

/**
 * A simple Cocoon crawler.
 *
 * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
 * @version CVS $Id$
 */
public class SimpleCocoonCrawlerImpl extends AbstractLogEnabled
        implements CocoonCrawler, Configurable, Disposable, Recyclable {

    /**
     * Config element name specifying the expected link content-type.
     * <p>
     * Its value is <code>link-content-type</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of the <code>link-content-type</code> configuration option.
     * <p>
     * Its value is <code>application/x-cocoon-links</code>.
     * </p>
     */
    public final String LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;

    /**
     * Config element name specifying the query-string appended when requesting
     * the links of a URL.
     * <p>
     * Its value is <code>link-view-query</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of the <code>link-view-query</code> configuration option.
     * <p>
     * Its value is <code>cocoon-view=links</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";

    /**
     * Config element name specifying the excluding regular expression pattern.
     * <p>
     * Its value is <code>exclude</code>.
     * </p>
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying the including regular expression pattern.
     * <p>
     * Its value is <code>include</code>.
     * </p>
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying the HTTP header value for User-Agent.
     * <p>
     * Its value is <code>user-agent</code>.
     * </p>
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of the <code>user-agent</code> configuration option.
     * @see Constants#COMPLETE_NAME
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
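    // Illustrative configuration sketch. This is not taken from a real
    // cocoon.xconf: the enclosing component declaration is omitted and all
    // values shown are examples only. The child element names correspond to
    // the *_CONFIG constants declared above:
    //
    //   <include>.*\.html?</include>
    //   <exclude>.*\.gif, .*\.jpe?g</exclude>
    //   <link-content-type>application/x-cocoon-links</link-content-type>
    //   <link-view-query>cocoon-view=links</link-view-query>
    //   <user-agent>test-crawler</user-agent>
    //   <accept>*/*</accept>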
    /**
     * Config element name specifying the HTTP header value for Accept.
     * <p>
     * Its value is <code>accept</code>.
     * </p>
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of the <code>accept</code> configuration option.
     * <p>
     * Its value is <code>* / *</code>.
     * </p>
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;
    private String userAgent = USER_AGENT_DEFAULT;
    private String accept = ACCEPT_DEFAULT;

    private HashSet crawled;

    protected int depth;
    protected HashSet urlsToProcess;
    protected HashSet urlsNextDepth;

    /**
     * Constructor for the SimpleCocoonCrawlerImpl object
     */
    public SimpleCocoonCrawlerImpl() {
        // by default include everything
        includeCrawlingURL = null;
        // by default exclude common image patterns
        excludeCrawlingURL = null;
    }

    /**
     * Configure the crawler component.
     * <p>
     * The configuration can specify which URIs to include and which URIs to
     * exclude from crawling. The patterns are given as regular expressions.
     * </p>
     * <p>
     * Moreover you can configure the required content-type of the crawling
     * request, and the query-string appended to each crawling request.
     * </p>
     * <pre><tt>
     * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
     * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
     * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
     * &lt;link-view-query&gt; ?cocoon-view=links &lt;/link-view-query&gt;
     * </tt></pre>
     *
     * @param configuration XML configuration of this Avalon component.
     * @exception ConfigurationException is thrown if the configuration is invalid.
     */
    public void configure(Configuration configuration)
            throws ConfigurationException {

        Configuration[] children;
        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            includeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String params[] = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.includeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create including regular-expression for " +
                            pattern, rese);
                }
            }
        } else {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Include all URLs");
            }
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children.length > 0) {
            excludeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String params[] = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.excludeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create excluding regular-expression for " +
                            pattern, rese);
                }
            }
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Exclude default URLs only");
            }
        }

        Configuration child;
        String value;
        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }
        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }
        child = configuration.getChild(USER_AGENT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.userAgent = value;
            }
        }
        child = configuration.getChild(ACCEPT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.accept = value;
            }
        }
    }
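    // Note on pattern semantics, as applied by getLinks(), isIncludedURL() and
    // isExcludedURL() below: a URL is only followed if it matches at least one
    // include pattern (or no include patterns are configured at all) and
    // matches no exclude pattern. For example (URLs purely hypothetical), with
    // only the default excludes in place, "http://host/index.html" would be
    // followed while "http://host/logo.gif" would be skipped.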
    /**
     * Dispose at end of life cycle, releasing all resources.
     */
    public void dispose() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        excludeCrawlingURL = null;
        includeCrawlingURL = null;
    }

    /**
     * Recycle this object, releasing resources.
     */
    public void recycle() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        depth = -1;
    }

    /**
     * The same as calling crawl(url,-1);
     *
     * @param url Crawl this URL, getting all links from this URL.
     */
    public void crawl(URL url) {
        crawl(url, -1);
    }

    /**
     * Start crawling a URL.
     *
     * <p>
     * Use this method to start crawling. Get this URL, and all its children,
     * by using <code>iterator()</code>. The Iterator object will return
     * URL objects.
     * </p>
     * <p>
     * You may use the crawl() and iterator() methods the following way:
     * </p>
     * <pre><tt>
     * SimpleCocoonCrawlerImpl scci = ....;
     * scci.crawl(new URL("http://foo/bar"));
     * Iterator i = scci.iterator();
     * while (i.hasNext()) {
     *     URL url = (URL) i.next();
     *     ...
     * }
     * </tt></pre>
     * <p>
     * The i.next() method returns a URL, and calculates the links of the
     * URL before returning it.
     * </p>
     *
     * @param url Crawl this URL, getting all links from this URL.
     * @param maxDepth maximum depth to crawl to. -1 for no maximum.
     */
    public void crawl(URL url, int maxDepth) {
        crawled = new HashSet();
        urlsToProcess = new HashSet();
        urlsNextDepth = new HashSet();
        depth = maxDepth;

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + url + " to depth " + maxDepth);
        }

        urlsToProcess.add(url);
    }

    /**
     * Return an iterator, iterating over all links of the currently crawled URL.
     * <p>
     * The Iterator object will return URL objects from its <code>next()</code>
     * method.
     * </p>
     *
     * @return Iterator iterator of all links from the crawl URL.
     */
    public Iterator iterator() {
        return new CocoonCrawlerIterator(this);
    }

    /**
     * Default exclude patterns.
     * <p>
     * By default URLs matching the following patterns are excluded:
     * </p>
     * <ul>
     * <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
     * <li>.*\\.png(\\?.*)?$ - exclude png images</li>
     * <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
     * <li>.*\\.js(\\?.*)?$ - exclude javascript</li>
     * <li>.*\\.css(\\?.*)?$ - exclude cascading stylesheets</li>
     * </ul>
     */
    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = {
            ".*\\.gif(\\?.*)?$",
            ".*\\.png(\\?.*)?$",
            ".*\\.jpe?g(\\?.*)?$",
            ".*\\.js(\\?.*)?$",
            ".*\\.css(\\?.*)?$"
        };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create excluding regular-expression for " +
                        pattern, rese);
            }
        }
    }
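    // How the link-view request URL is built by getLinks() below, shown with a
    // purely hypothetical page URL and the default link-view-query:
    //
    //   http://host/page.html      ->  http://host/page.html?cocoon-view=links
    //   http://host/page.html?a=b  ->  http://host/page.html?a=b&cocoon-view=links
    //
    // i.e. the query string is appended with '?' or '&' depending on whether
    // the original URL already carries a query part.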
    /**
     * Compute the list of links from the url.
     * <p>
     * Check the include and exclude patterns, the content-type, and whether
     * the url has been crawled already.
     * </p>
     *
     * @param url Crawl this URL
     * @return List of URLs which are links from url, satisfying the conditions above.
     */
    private List getLinks(URL url) {
        ArrayList url_links = null;
        String sURL = url.toString();

        if (!isIncludedURL(sURL) || isExcludedURL(sURL)) {
            return null;
        }

        // don't try to get links for url which has been crawled already
        if (crawled.contains(sURL)) {
            return null;
        }

        // mark it as crawled
        crawled.add(sURL);

        // get links of url
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Getting links of URL " + sURL);
        }
        BufferedReader br = null;
        try {
            sURL = url.getFile();
            URL links = new URL(url, sURL
                    + ((sURL.indexOf("?") == -1) ? "?" : "&")
                    + linkViewQuery);
            URLConnection links_url_connection = links.openConnection();
            links_url_connection.setRequestProperty("Accept", accept);
            links_url_connection.setRequestProperty("User-Agent", userAgent);
            links_url_connection.connect();
            InputStream is = links_url_connection.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));

            String contentType = links_url_connection.getContentType();
            if (contentType == null) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Ignoring " + sURL + " (no content type)");
                }
                // there is a check on null in the calling method
                return null;
            }

            int index = contentType.indexOf(';');
            if (index != -1) {
                contentType = contentType.substring(0, index);
            }

            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Content-type: " + contentType);
            }

            if (contentType.equals(linkContentType)) {
                url_links = new ArrayList();

                // content is supposed to be a list of links,
                // relative to current URL
                String line;
                while ((line = br.readLine()) != null) {
                    final URL newUrl = new URL(url, line);
                    final String sNewUrl = newUrl.toString();

                    boolean add_url = true;
                    // don't add new_url twice
                    if (add_url) {
                        add_url &= !url_links.contains(sNewUrl);
                    }

                    // don't add new_url if it has been crawled already
                    if (add_url) {
                        add_url &= !crawled.contains(sNewUrl);
                    }

                    // don't add if it is not matched by an existing include definition
                    if (add_url) {
                        add_url &= isIncludedURL(sNewUrl);
                    }

                    // don't add if it is matched by an existing exclude definition
                    if (add_url) {
                        add_url &= !isExcludedURL(sNewUrl);
                    }

                    if (add_url) {
                        if (getLogger().isDebugEnabled()) {
                            getLogger().debug("Add URL: " + sNewUrl);
                        }
                        url_links.add(newUrl);
                    }
                }
                // now we have a list of URLs which should be examined
            }
        } catch (IOException ioe) {
            getLogger().warn("Problems getting links of " + url, ioe);
        } finally {
            if (br != null) {
                try {
                    br.close();
                    br = null;
                } catch (IOException ignored) {
                }
            }
        }
        return url_links;
    }

    /**
     * Check if a URL is excluded from crawling.
     *
     * @param url the URL to check
     * @return The excludedURL value
     */
    private boolean isExcludedURL(String url) {
        // by default do not exclude URL from crawling
        if (excludeCrawlingURL == null) {
            return false;
        }

        final String s = url;
        Iterator i = excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Excluded URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not excluded URL " + url);
        }
        return false;
    }

    /**
     * Check if a URL is a candidate for indexing.
     *
     * @param url the URL to check
     * @return The includedURL value
     */
    private boolean isIncludedURL(String url) {
        // by default include URL for crawling
        if (includeCrawlingURL == null) {
            return true;
        }

        final String s = url;
        Iterator i = includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Included URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not included URL " + url);
        }
        return false;
    }
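    // Depth handling of the iterator below, sketched with assumed values
    // ("links(start)" is shorthand for the link list returned by getLinks(start)):
    //
    //   crawl(start, 2)                     -> depth == 2, urlsToProcess == {start}
    //   first next(): fetches links(start)  -> urlsNextDepth == links(start), returns start
    //   once urlsToProcess is exhausted     -> urlsToProcess = links(start), depth == 1
    //
    // urlsToProcess always holds the current depth level, urlsNextDepth collects
    // the links for the next level, and depth is decreased each time the iterator
    // moves on to the next level (unless depth is -1, meaning unlimited).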
    /**
     * Helper class implementing an Iterator.
     * <p>
     * This Iterator implementation calculates the links of a URL
     * before returning them in the next() method.
     * </p>
     *
     * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
     * @version $Id$
     */
    public static class CocoonCrawlerIterator implements Iterator {

        private SimpleCocoonCrawlerImpl cocoonCrawler;

        /**
         * Constructor for the CocoonCrawlerIterator object
         *
         * @param cocoonCrawler the containing CocoonCrawler instance.
         */
        CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
            this.cocoonCrawler = cocoonCrawler;
        }

        /**
         * Check if crawling is finished.
         *
         * @return <code>true</code> if crawling has finished,
         *         else <code>false</code>.
         */
        public boolean hasNext() {
            return cocoonCrawler.urlsToProcess.size() > 0
                    || cocoonCrawler.urlsNextDepth.size() > 0;
        }

        /**
         * @return the next URL
         */
        public Object next() {
            if (cocoonCrawler.urlsToProcess.size() == 0
                    && cocoonCrawler.urlsNextDepth.size() > 0) {
                // process queued urls belonging to the next depth level
                cocoonCrawler.urlsToProcess = cocoonCrawler.urlsNextDepth;
                cocoonCrawler.urlsNextDepth = new HashSet();
                // fix Bugzilla Bug 25270
                // only decrease if depth > 0, excluding decreasing
                // if depth is already equal to -1
                if (cocoonCrawler.depth > 0) {
                    cocoonCrawler.depth--;
                }
            }
            URL theNextUrl = null;
            // fix Bugzilla Bug 25270
            // return theNextUrl != null only if getLinks() returns a non-null list
            for (Iterator i = cocoonCrawler.urlsToProcess.iterator();
                    i.hasNext() && theNextUrl == null;) {
                // fetch a URL
                URL url = (URL) i.next();

                // remove it from the to-do list
                i.remove();

                if (cocoonCrawler.depth == -1 || cocoonCrawler.depth > 0) {
                    // calc all links from this url
                    List url_links = cocoonCrawler.getLinks(url);
                    if (url_links != null) {
                        // add links of this url to the to-do list
                        cocoonCrawler.urlsNextDepth.addAll(url_links);
                        theNextUrl = url;
                    }
                }
            }
            // finally return url
            return theNextUrl;
        }

        /**
         * remove is not implemented
         */
        public void remove() {
            throw new UnsupportedOperationException("remove is not implemented");
        }
    }
}