/*
* Efficient crawl library implemented with NIO.
* http://www.niocchi.com
* Copyright (C) 2009 François-Louis Mommens.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.github.mefi.jkuuza.crawler;
import java.util.HashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.niocchi.core.URLPool;
import org.niocchi.core.URLPoolException;
import org.niocchi.core.Query;
import org.niocchi.core.QueryStatus;
/**
* This class is a URLPool wrapper that drops all subsequent queries
* from hosts that have reached too many consecutive timeouts.
* Implementation detail: when a query is dropped, getNextQuery() hands it
* back through _url_pool.setProcessed(), assuming _url_pool will not
* store it.
*
* @author FL Mommens
* @author Marek Pilecky
*/
public class TimeoutURLPool implements URLPool {

    private static final Log _logger = LogFactory.getLog(TimeoutURLPool.class);

    /** Default number of consecutive timeouts after which a host is dropped. */
    protected static final int _DEFAULT_MAX_TIMEOUTS = 10;

    /** Wrapped pool that actually supplies and collects the queries. */
    final URLPool _url_pool;

    /** Consecutive-timeout threshold; see setMaxConsecutiveTimeouts(). */
    int _max_timeouts = _DEFAULT_MAX_TIMEOUTS;

    /** Number of consecutive timeouts per host; entry removed on any non-timeout result. */
    final HashMap<String, Integer> _timeout_map = new HashMap<String, Integer>();

    // ------------------------------------------------------------
    /**
     * Creates a wrapper around the given pool.
     *
     * @param pool_ the pool to delegate to; must not be null
     */
    public TimeoutURLPool(URLPool pool_) {
        _url_pool = pool_;
    }

    // ------------------------------------------------------------
    /**
     * @return true if the wrapped pool has more queries
     */
    public boolean hasNextQuery() {
        return _url_pool.hasNextQuery();
    }

    // ------------------------------------------------------------
    /**
     * Returns the next query from the wrapped pool, silently dropping any
     * query whose host has reached the consecutive-timeout limit. Dropped
     * queries are marked DROPPED and handed back to the wrapped pool via
     * setProcessed(), because the pool has to receive all the processed or
     * dropped queries.
     *
     * @return the next acceptable query, or null if none is ready yet
     * @throws URLPoolException propagated from the wrapped pool
     */
    public Query getNextQuery() throws URLPoolException {
        while (_url_pool.hasNextQuery()) {
            Query query = _url_pool.getNextQuery();
            if (query == null) {
                return null; // no query ready yet
            }
            Integer ti = _timeout_map.get(query.getHost());
            // A host with no map entry has zero consecutive timeouts,
            // so only a non-null count can reach the limit.
            if (ti != null && ti >= _max_timeouts) {
                _logger.debug("Dropping URL[ " + query.getURL() + " ]");
                query.setStatus(QueryStatus.DROPPED);
                // We send the query back because the URLPool has to
                // receive all the processed or dropped queries.
                _url_pool.setProcessed(query);
                continue;
            }
            return query;
        }
        return null;
    }

    // ------------------------------------------------------------
    /**
     * Updates the per-host consecutive-timeout counter before delegating to
     * the wrapped pool. A TIMEOUT status increments the host's counter; any
     * other status resets it (the counter tracks *consecutive* timeouts).
     *
     * @param query the processed query to account for and forward
     */
    public void setProcessed(Query query) {
        String host = query.getHost();
        if (query.getStatus() == QueryStatus.TIMEOUT) {
            Integer ti = _timeout_map.get(host);
            if (ti == null) {
                ti = 0;
            }
            ti++;
            _timeout_map.put(host, ti);
            // Log only when the threshold is first crossed (== rather than >=)
            // so in-flight timeouts past the limit don't repeat the message.
            if (ti == _max_timeouts) {
                _logger.info("Maximum timeouts reached per Host[ " + query.getURL().getProtocol() + "://" + host + " ] It will be dropped.");
            }
        } else {
            _timeout_map.remove(host);
        }
        _url_pool.setProcessed(query);
    }

    // ------------------------------------------------------------
    /**
     * Sets the number of consecutive timeouts a host may accumulate before
     * its subsequent queries are dropped.
     *
     * @param max_ the new threshold
     */
    public void setMaxConsecutiveTimeouts(int max_) {
        _max_timeouts = max_;
    }

    // start of changed part by Marek Pilecky
    /**
     * @return the wrapped pool this instance delegates to
     */
    public URLPool getUrlPool() {
        return _url_pool;
    }
    // end of changed part
}