/**
* $Id: SpiderDetector.java 4745 2010-02-07 17:44:17Z mdiggory $
* $URL: https://scm.dspace.org/svn/repo/dspace/tags/dspace-1.6.2/dspace-stats/src/main/java/org/dspace/statistics/util/SpiderDetector.java $
* *************************************************************************
* Copyright (c) 2002-2009, DuraSpace. All rights reserved
* Licensed under the DuraSpace Foundation License.
*
* A copy of the DuraSpace License has been included in this
* distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt
*/
package org.dspace.statistics.util;
import org.apache.log4j.Logger;
import org.dspace.core.ConfigurationManager;
import org.dspace.statistics.SolrLogger;
import javax.servlet.http.HttpServletRequest;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
/**
* SpiderDetector is used to find IP's that are spiders...
* In future someone may add UserAgents and Host Domains
* to the detection criteria here.
*
* @author kevinvandevelde at atmire.com
* @author ben at atmire.com
* @author Mark Diggory (mdiggory at atmire.com)
*/
public class SpiderDetector {
private static Logger log = Logger.getLogger(SpiderDetector.class);
/**
* Sparse HAshTable structure to hold IP Address Ranges.
*/
private static IPTable table = null;
/**
* Utility method which Reads the ip addresses out a file & returns them in a Set
*
* @param spiderIpFile the location of our spider file
* @return a vector full of ip's
* @throws IOException could not happen since we check the file be4 we use it
*/
public static HashSet<String> readIpAddresses(File spiderIpFile) throws IOException {
HashSet<String> ips = new HashSet<String>();
if (!spiderIpFile.exists() || !spiderIpFile.isFile())
return ips;
//Read our file & get all them ip's
BufferedReader in = new BufferedReader(new FileReader(spiderIpFile));
String line;
while ((line = in.readLine()) != null) {
if (!line.startsWith("#")) {
line = line.trim();
if (!line.equals("") && !Character.isDigit(line.charAt(0))) {
// is a hostname
// add this functionality later...
} else if (!line.equals("")) {
ips.add(line);
// is full v4 ip (too tired to deal with v6)...
}
} else {
// ua.add(line.replaceFirst("#","").replaceFirst("UA","").trim());
// ... add this functionality later
}
}
in.close();
return ips;
}
/**
* Get an immutable Set representing all the Spider Addresses here
*
* @return
*/
public static Set<String> getSpiderIpAddresses() {
loadSpiderIpAddresses();
return table.toSet();
}
/*
private loader to populate the table from files.
*/
private static void loadSpiderIpAddresses() {
if (table == null) {
table = new IPTable();
String filePath = ConfigurationManager.getProperty("dspace.dir");
try {
File spidersDir = new File(filePath, "config/spiders");
if (spidersDir.exists() && spidersDir.isDirectory()) {
for (File file : spidersDir.listFiles()) {
for (String ip : readIpAddresses(file)) {
table.add(ip);
}
log.info("Loaded Spider IP file: " + file);
}
} else {
log.info("No spider file loaded");
}
}
catch (Exception e) {
log.error("Error Loading Spiders:" + e.getMessage(), e);
}
}
}
/**
* Static Service Method for testing spiders against existing spider files.
* <p/>
* In the future this will be extended to support User Agent and
* domain Name detection.
* <p/>
* In future spiders HashSet may be optimized as byte offset array to
* improve performance and memory footprint further.
*
* @param request
* @return true|false if the request was detected to be from a spider
*/
public static boolean isSpider(HttpServletRequest request) {
if (SolrLogger.isUseProxies() && request.getHeader("X-Forwarded-For") != null) {
/* This header is a comma delimited list */
for (String xfip : request.getHeader("X-Forwarded-For").split(",")) {
if (isSpider(xfip))
return true;
}
}
return isSpider(request.getRemoteAddr());
}
/**
* Check individual IP is a spider.
*
* @param ip
* @return if is spider IP
*/
public static boolean isSpider(String ip) {
if (table == null) {
SpiderDetector.loadSpiderIpAddresses();
}
try {
if (table.contains(ip)) {
return true;
}
} catch (Exception e) {
return false;
}
return false;
}
}