/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.components.crawler;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.cocoon.Constants;
import org.apache.commons.lang.StringUtils;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

/**
 * A simple Cocoon crawler.
 *
 * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
 * @version CVS $Id$
 */
public class SimpleCocoonCrawlerImpl extends AbstractLogEnabled
        implements CocoonCrawler, Configurable, Disposable, Recyclable {

    /**
     * Config element name specifying the expected link content-type.
     * <p>
     * Its value is <code>link-content-type</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of the <code>link-content-type</code> configuration option.
     * <p>
     * Its value is <code>application/x-cocoon-links</code>.
     * </p>
     */
    public final String LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;

    /**
     * Config element name specifying the query-string appended when requesting
     * the links of a URL.
     * <p>
     * Its value is <code>link-view-query</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of the <code>link-view-query</code> configuration option.
     * <p>
     * Its value is <code>cocoon-view=links</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";

    /**
     * Config element name specifying the excluding regular expression pattern.
     * <p>
     * Its value is <code>exclude</code>.
     * </p>
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying the including regular expression pattern.
     * <p>
     * Its value is <code>include</code>.
     * </p>
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying the HTTP header value for User-Agent.
     * <p>
     * Its value is <code>user-agent</code>.
     * </p>
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of the <code>user-agent</code> configuration option.
     * @see Constants#COMPLETE_NAME
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
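    // Illustrative configuration sketch. This is not taken from a real
    // cocoon.xconf: the enclosing component declaration is omitted and all
    // values shown are examples only. The child element names correspond to
    // the *_CONFIG constants declared above:
    //
    //   <include>.*\.html?</include>
    //   <exclude>.*\.gif, .*\.jpe?g</exclude>
    //   <link-content-type>application/x-cocoon-links</link-content-type>
    //   <link-view-query>cocoon-view=links</link-view-query>
    //   <user-agent>test-crawler</user-agent>
    //   <accept>*/*</accept>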
    /**
     * Config element name specifying the HTTP header value for Accept.
     * <p>
     * Its value is <code>accept</code>.
     * </p>
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of the <code>accept</code> configuration option.
     * <p>
     * Its value is <code>* / *</code>.
     * </p>
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;
    private String userAgent = USER_AGENT_DEFAULT;
    private String accept = ACCEPT_DEFAULT;

    private HashSet crawled;

    protected int depth;
    protected HashSet urlsToProcess;
    protected HashSet urlsNextDepth;

    /**
     * Constructor for the SimpleCocoonCrawlerImpl object
     */
    public SimpleCocoonCrawlerImpl() {
        // by default include everything
        includeCrawlingURL = null;
        // by default exclude common image patterns
        excludeCrawlingURL = null;
    }

    /**
     * Configure the crawler component.
     * <p>
     * The configuration can specify which URIs to include and which URIs to
     * exclude from crawling. The patterns are given as regular expressions.
     * </p>
     * <p>
     * Moreover you can configure the required content-type of the crawling
     * request, and the query-string appended to each crawling request.
     * </p>
     * <pre><tt>
     * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
     * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
     * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
     * &lt;link-view-query&gt; ?cocoon-view=links &lt;/link-view-query&gt;
     * </tt></pre>
     *
     * @param configuration XML configuration of this Avalon component.
     * @exception ConfigurationException is thrown if the configuration is invalid.
     */
    public void configure(Configuration configuration)
            throws ConfigurationException {

        Configuration[] children;
        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            includeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String params[] = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.includeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create including regular-expression for " +
                            pattern, rese);
                }
            }
        } else {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Include all URLs");
            }
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children.length > 0) {
            excludeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String params[] = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenized_pattern = params[index];
                        this.excludeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error("Cannot create excluding regular-expression for " +
                            pattern, rese);
                }
            }
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Exclude default URLs only");
            }
        }

        Configuration child;
        String value;
        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }
        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }
        child = configuration.getChild(USER_AGENT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.userAgent = value;
            }
        }
        child = configuration.getChild(ACCEPT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.accept = value;
            }
        }
    }
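    // Note on pattern semantics, as applied by getLinks(), isIncludedURL() and
    // isExcludedURL() below: a URL is only followed if it matches at least one
    // include pattern (or no include patterns are configured at all) and
    // matches no exclude pattern. For example (URLs purely hypothetical), with
    // only the default excludes in place, "http://host/index.html" would be
    // followed while "http://host/logo.gif" would be skipped.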
    /**
     * Dispose at end of life cycle, releasing all resources.
     */
    public void dispose() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        excludeCrawlingURL = null;
        includeCrawlingURL = null;
    }

    /**
     * Recycle this object, releasing resources.
     */
    public void recycle() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        depth = -1;
    }

    /**
     * The same as calling crawl(url,-1);
     *
     * @param url Crawl this URL, getting all links from this URL.
     */
    public void crawl(URL url) {
        crawl(url, -1);
    }

    /**
     * Start crawling a URL.
     *
     * <p>
     * Use this method to start crawling. Get this URL, and all its children,
     * by using <code>iterator()</code>. The Iterator object will return
     * URL objects.
     * </p>
     * <p>
     * You may use the crawl() and iterator() methods the following way:
     * </p>
     * <pre><tt>
     * SimpleCocoonCrawlerImpl scci = ....;
     * scci.crawl(new URL("http://foo/bar"));
     * Iterator i = scci.iterator();
     * while (i.hasNext()) {
     *     URL url = (URL) i.next();
     *     ...
     * }
     * </tt></pre>
     * <p>
     * The i.next() method returns a URL, and calculates the links of the
     * URL before returning it.
     * </p>
     *
     * @param url Crawl this URL, getting all links from this URL.
     * @param maxDepth maximum depth to crawl to. -1 for no maximum.
     */
    public void crawl(URL url, int maxDepth) {
        crawled = new HashSet();
        urlsToProcess = new HashSet();
        urlsNextDepth = new HashSet();
        depth = maxDepth;

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + url + " to depth " + maxDepth);
        }

        urlsToProcess.add(url);
    }

    /**
     * Return an iterator, iterating over all links of the currently crawled URL.
     * <p>
     * The Iterator object will return URL objects from its <code>next()</code>
     * method.
     * </p>
     *
     * @return Iterator iterator of all links from the crawl URL.
     */
    public Iterator iterator() {
        return new CocoonCrawlerIterator(this);
    }

    /**
     * Default exclude patterns.
     * <p>
     * By default URLs matching the following patterns are excluded:
     * </p>
     * <ul>
     * <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
     * <li>.*\\.png(\\?.*)?$ - exclude png images</li>
     * <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
     * <li>.*\\.js(\\?.*)?$ - exclude javascript</li>
     * <li>.*\\.css(\\?.*)?$ - exclude cascading stylesheets</li>
     * </ul>
     */
    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = {
            ".*\\.gif(\\?.*)?$",
            ".*\\.png(\\?.*)?$",
            ".*\\.jpe?g(\\?.*)?$",
            ".*\\.js(\\?.*)?$",
            ".*\\.css(\\?.*)?$"
        };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create excluding regular-expression for " +
                        pattern, rese);
            }
        }
    }
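    // How the link-view request URL is built by getLinks() below, shown with a
    // purely hypothetical page URL and the default link-view-query:
    //
    //   http://host/page.html      ->  http://host/page.html?cocoon-view=links
    //   http://host/page.html?a=b  ->  http://host/page.html?a=b&cocoon-view=links
    //
    // i.e. the query string is appended with '?' or '&' depending on whether
    // the original URL already carries a query part.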
    /**
     * Compute the list of links from the url.
     * <p>
     * Check the include and exclude patterns, the content-type, and whether
     * the url has been crawled already.
     * </p>
     *
     * @param url Crawl this URL
     * @return List of URLs which are links from url, satisfying the conditions above.
     */
    private List getLinks(URL url) {
        ArrayList url_links = null;
        String sURL = url.toString();

        if (!isIncludedURL(sURL) || isExcludedURL(sURL)) {
            return null;
        }

        // don't try to get links for url which has been crawled already
        if (crawled.contains(sURL)) {
            return null;
        }

        // mark it as crawled
        crawled.add(sURL);

        // get links of url
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Getting links of URL " + sURL);
        }
        BufferedReader br = null;
        try {
            sURL = url.getFile();
            URL links = new URL(url, sURL
                    + ((sURL.indexOf("?") == -1) ? "?" : "&")
                    + linkViewQuery);
            URLConnection links_url_connection = links.openConnection();
            links_url_connection.setRequestProperty("Accept", accept);
            links_url_connection.setRequestProperty("User-Agent", userAgent);
            links_url_connection.connect();
            InputStream is = links_url_connection.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));

            String contentType = links_url_connection.getContentType();
            if (contentType == null) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Ignoring " + sURL + " (no content type)");
                }
                // there is a check on null in the calling method
                return null;
            }

            int index = contentType.indexOf(';');
            if (index != -1) {
                contentType = contentType.substring(0, index);
            }

            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Content-type: " + contentType);
            }

            if (contentType.equals(linkContentType)) {
                url_links = new ArrayList();

                // content is supposed to be a list of links,
                // relative to current URL
                String line;
                while ((line = br.readLine()) != null) {
                    final URL newUrl = new URL(url, line);
                    final String sNewUrl = newUrl.toString();

                    boolean add_url = true;
                    // don't add new_url twice
                    if (add_url) {
                        add_url &= !url_links.contains(sNewUrl);
                    }

                    // don't add new_url if it has been crawled already
                    if (add_url) {
                        add_url &= !crawled.contains(sNewUrl);
                    }

                    // don't add if it is not matched by an existing include definition
                    if (add_url) {
                        add_url &= isIncludedURL(sNewUrl);
                    }

                    // don't add if it is matched by an existing exclude definition
                    if (add_url) {
                        add_url &= !isExcludedURL(sNewUrl);
                    }

                    if (add_url) {
                        if (getLogger().isDebugEnabled()) {
                            getLogger().debug("Add URL: " + sNewUrl);
                        }
                        url_links.add(newUrl);
                    }
                }
                // now we have a list of URLs which should be examined
            }
        } catch (IOException ioe) {
            getLogger().warn("Problems getting links of " + url, ioe);
        } finally {
            if (br != null) {
                try {
                    br.close();
                    br = null;
                } catch (IOException ignored) {
                }
            }
        }
        return url_links;
    }

    /**
     * Check if a URL is excluded from crawling.
     *
     * @param url the URL to check
     * @return The excludedURL value
     */
    private boolean isExcludedURL(String url) {
        // by default do not exclude URL from crawling
        if (excludeCrawlingURL == null) {
            return false;
        }

        final String s = url;
        Iterator i = excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Excluded URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not excluded URL " + url);
        }
        return false;
    }

    /**
     * Check if a URL is a candidate for indexing.
     *
     * @param url the URL to check
     * @return The includedURL value
     */
    private boolean isIncludedURL(String url) {
        // by default include URL for crawling
        if (includeCrawlingURL == null) {
            return true;
        }

        final String s = url;
        Iterator i = includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Included URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not included URL " + url);
        }
        return false;
    }
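    // Depth handling of the iterator below, sketched with assumed values
    // ("links(start)" is shorthand for the link list returned by getLinks(start)):
    //
    //   crawl(start, 2)                     -> depth == 2, urlsToProcess == {start}
    //   first next(): fetches links(start)  -> urlsNextDepth == links(start), returns start
    //   once urlsToProcess is exhausted     -> urlsToProcess = links(start), depth == 1
    //
    // urlsToProcess always holds the current depth level, urlsNextDepth collects
    // the links for the next level, and depth is decreased each time the iterator
    // moves on to the next level (unless depth is -1, meaning unlimited).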
    /**
     * Helper class implementing an Iterator.
     * <p>
     * This Iterator implementation calculates the links of a URL
     * before returning them in the next() method.
     * </p>
     *
     * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
     * @version $Id$
     */
    public static class CocoonCrawlerIterator implements Iterator {

        private SimpleCocoonCrawlerImpl cocoonCrawler;

        /**
         * Constructor for the CocoonCrawlerIterator object
         *
         * @param cocoonCrawler the containing CocoonCrawler instance.
         */
        CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
            this.cocoonCrawler = cocoonCrawler;
        }

        /**
         * Check if crawling is finished.
         *
         * @return <code>true</code> if crawling has finished,
         *         else <code>false</code>.
         */
        public boolean hasNext() {
            return cocoonCrawler.urlsToProcess.size() > 0
                    || cocoonCrawler.urlsNextDepth.size() > 0;
        }

        /**
         * @return the next URL
         */
        public Object next() {
            if (cocoonCrawler.urlsToProcess.size() == 0
                    && cocoonCrawler.urlsNextDepth.size() > 0) {
                // process queued urls belonging to the next depth level
                cocoonCrawler.urlsToProcess = cocoonCrawler.urlsNextDepth;
                cocoonCrawler.urlsNextDepth = new HashSet();
                // fix Bugzilla Bug 25270
                // only decrease if depth > 0, excluding decreasing
                // if depth is already equal to -1
                if (cocoonCrawler.depth > 0) {
                    cocoonCrawler.depth--;
                }
            }
            URL theNextUrl = null;
            // fix Bugzilla Bug 25270
            // return theNextUrl != null only if getLinks() returns a non-null list
            for (Iterator i = cocoonCrawler.urlsToProcess.iterator();
                    i.hasNext() && theNextUrl == null;) {
                // fetch a URL
                URL url = (URL) i.next();

                // remove it from the to-do list
                i.remove();

                if (cocoonCrawler.depth == -1 || cocoonCrawler.depth > 0) {
                    // calc all links from this url
                    List url_links = cocoonCrawler.getLinks(url);
                    if (url_links != null) {
                        // add links of this url to the to-do list
                        cocoonCrawler.urlsNextDepth.addAll(url_links);
                        theNextUrl = url;
                    }
                }
            }
            // finally return url
            return theNextUrl;
        }

        /**
         * remove is not implemented
         */
        public void remove() {
            throw new UnsupportedOperationException("remove is not implemented");
        }
    }
}