package org.dspace.app.stats;
import java.io.*;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.*;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.MissingArgumentException;
import org.apache.commons.cli.MissingOptionException;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.dspace.core.Context;
import org.dspace.storage.rdbms.DatabaseManager;
import org.dspace.storage.rdbms.TableRow;
import org.dspace.storage.rdbms.TableRowIterator;
/**
 * Command-line tool that scans a web server access log and records
 * suspected spiders in the <code>stats.agent_staging</code> table.
 *
 * <p>Each log line that matches {@link #LOG_LINE} is checked, in order,
 * against: the IPs loaded from <code>stats.ip_spider</code>, the agent names
 * loaded from <code>stats.agent</code>, the substrings loaded from
 * <code>stats.temp</code>, and finally the requested path
 * (<code>testpage.html</code> / <code>robots.txt</code>). The date of the
 * last matched line is stored in <code>stats.control.last_line_log</code>
 * and used on the next run to decide where processing starts.</p>
 *
 * <p>All state is held in static fields; the class is driven from
 * {@link #main(String[])}.</p>
 */
class SpiderDetector
{
    /**
     * One access-log line. Groups read by this class: 1 = client IP,
     * 4 = bracketed date, 6 = second token of the quoted request,
     * 10 = referer, 11 = user agent. Compiled once instead of per line.
     */
    private static final Pattern LOG_LINE = Pattern.compile(
        "^([^ ]+) ([^ ]+) ([^ ]+) \\[(.*)\\] \"(.*) (.*) (.*)\" ([0-9\\-]+) ([0-9\\-]+) \"(.*)\" \"(.*)\"$");

    /** Set by the <code>-v</code> option; not read anywhere else in this class. */
    private static boolean isVerbose = false;

    /** Path of the log file, taken from the <code>-p</code> option. */
    private static String pathToWeblog = null;

    /** IPs loaded from <code>stats.ip_spider</code>. */
    private static Set<String> ipSpider = new HashSet<String>();

    /** Agent names loaded from <code>stats.agent</code>. */
    private static Set<String> agents = new HashSet<String>();

    /** IPs already staged with type 1 during this run. */
    private static Set<String> insertedSpider = new HashSet<String>();

    /** Agent strings already staged with type 2, 3 or 4 during this run. */
    private static Set<String> insertedAgent = new HashSet<String>();

    /** Substrings loaded from <code>stats.temp</code>, searched for inside agent strings. */
    private static List<String> compareStrings = new ArrayList<String>();

    /**
     * Entry point: parses the command line, opens a DSpace context and
     * processes the log file. The context is always aborted on the way out;
     * work is made permanent by the explicit commits in
     * {@link #findSpiders(Context, String)}.
     *
     * @param args command-line arguments (see {@link #setCommandLineOptions()})
     */
    public static void main(String[] args)
    {
        readCommandLineOptions(args);
        Context context = null;
        try
        {
            context = new Context();
            findSpiders(context, pathToWeblog);
        }
        catch (SQLException e)
        {
            System.out.println("Database error: " + e.getMessage());
        }
        finally
        {
            if ((context != null) && context.isValid())
            {
                context.abort();
            }
        }
    }

    /**
     * Reads the log file line by line and stages every suspected spider.
     *
     * <p>If <code>stats.control</code> holds a last processed date, lines are
     * skipped until a line with exactly that date is seen (that line itself
     * is processed again). Without a stored date every line is processed.
     * After the file has been read, the date of the last matching line is
     * written back to <code>stats.control</code>.</p>
     *
     * @param context  open DSpace context used for all queries and commits
     * @param filePath path of the access log to read
     * @throws SQLException if loading the lookup tables, committing, or
     *                      updating the control table fails
     */
    private static void findSpiders(Context context, String filePath) throws SQLException
    {
        BufferedReader in = null;
        try
        {
            // Open the file first so that an unreadable path is reported
            // before any database work is done.
            in = new BufferedReader(new FileReader(filePath));

            getIpSpiders(context);
            getAgents(context);
            getCompareStrings(context);

            String lastDate = readLastProcessedDate(context);
            // With no stored date there is nothing to skip.
            boolean find = (lastDate == null);
            String date = null;
            String line;

            while ((line = in.readLine()) != null)
            {
                Matcher matcher = LOG_LINE.matcher(line);
                if (matcher.matches())
                {
                    date = matcher.group(4);
                    if (!find && date.equals(lastDate))
                    {
                        find = true;
                    }
                    if (find)
                    {
                        classifyRequest(context, matcher.group(1), matcher.group(6),
                                matcher.group(10), matcher.group(11));
                    }
                }
                // NOTE(review): the per-line commit is kept from the original
                // code; presumably it lets partial progress survive a later
                // failure - confirm before batching it.
                context.commit();
            }

            // set last processed date
            if (date != null)
            {
                // Bound as a parameter: the value comes straight from the log file.
                DatabaseManager.updateQuery(context,
                        "update stats.control set last_line_log=?", date);
                context.commit();
            }
        }
        catch (IOException e)
        {
            System.err.println("Error: Can't read file: " + filePath);
        }
        finally
        {
            // Release the file handle on every path, including SQLException.
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException ignored)
                {
                    // The file was only read; a failed close loses nothing.
                }
            }
        }
    }

    /**
     * Fetches the date stored by the previous run.
     *
     * @param context open DSpace context
     * @return the <code>last_line_log</code> value of the first row of
     *         <code>stats.control</code>, or <code>null</code> if the table
     *         is empty or the column is NULL
     * @throws SQLException if the query fails
     */
    private static String readLastProcessedDate(Context context) throws SQLException
    {
        TableRowIterator iterator = DatabaseManager.query(context, "select * from stats.control");
        try
        {
            if (iterator.hasNext())
            {
                return iterator.next().getStringColumn("last_line_log");
            }
            return null;
        }
        finally
        {
            iterator.close();
        }
    }

    /**
     * Applies the detection rules to one parsed log entry and stages it if a
     * rule fires. Rules are tried in this order and the first applicable one
     * wins:
     * <ol>
     *   <li>IP already listed as a spider: nothing to do.</li>
     *   <li>Agent is a listed agent name: stage with type 1, once per IP.</li>
     *   <li>Agent contains one of the compare strings: stage with type 2,
     *       once per agent.</li>
     *   <li>Request contains <code>testpage.html</code>: stage with type 3,
     *       but only when the referer is <code>-</code>.</li>
     *   <li>Request contains <code>robots.txt</code>: stage with type 4,
     *       once per agent.</li>
     * </ol>
     *
     * @param context open DSpace context
     * @param ip      client IP of the entry
     * @param request requested path of the entry
     * @param referer referer field of the entry
     * @param agent   user-agent field of the entry
     */
    private static void classifyRequest(Context context, String ip, String request,
            String referer, String agent)
    {
        if (isSpider(ip))
        {
            return;
        }

        if (isAgent(agent))
        {
            // add() is true only the first time this IP is seen in this run.
            if (insertedSpider.add(ip))
            {
                insertAgentStaging(context, ip, agent, 1);
            }
            return;
        }

        if (matchesCompareString(agent))
        {
            if (insertedAgent.add(agent))
            {
                insertAgentStaging(context, ip, agent, 2);
            }
            return;
        }

        if (request.indexOf("testpage.html") >= 0)
        {
            if (referer.equals("-"))
            {
                // Not guarded by insertedAgent, as in the original: the same
                // agent is staged again when it arrives from another IP.
                insertAgentStaging(context, ip, agent, 3);
                insertedAgent.add(agent);
            }
            return;
        }

        if (request.indexOf("robots.txt") >= 0 && insertedAgent.add(agent))
        {
            insertAgentStaging(context, ip, agent, 4);
        }
    }

    /**
     * Tells whether the agent string contains any of the loaded compare strings.
     *
     * @param agent user-agent string of a log entry
     * @return <code>true</code> if at least one compare string occurs in it
     */
    private static boolean matchesCompareString(String agent)
    {
        for (String fragment : compareStrings)
        {
            if (agent.indexOf(fragment) >= 0)
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Inserts an (agent, ip, type) row into <code>stats.agent_staging</code>
     * unless a row with the same ip and name already exists. SQL errors are
     * reported on standard output and otherwise ignored, so one failing
     * entry does not stop the run.
     *
     * @param context open DSpace context
     * @param ip      client IP to record
     * @param agent   user-agent string to record
     * @param type    number of the detection rule that fired (1 to 4)
     */
    private static void insertAgentStaging(Context context, String ip, String agent, int type)
    {
        String sql = "select agent_id from stats.agent_staging where ip=? and name =?";
        try
        {
            boolean alreadyStaged;
            TableRowIterator iterator = DatabaseManager.query(context, sql, ip, agent);
            try
            {
                alreadyStaged = iterator.hasNext();
            }
            finally
            {
                iterator.close();
            }

            if (!alreadyStaged)
            {
                sql = "insert into stats.agent_staging values (getnextid('stats.agent_staging'),?, ?, ?)";
                DatabaseManager.updateQuery(context, sql, agent, ip, type);
            }
        }
        catch (SQLException e)
        {
            System.out.println("Cant execute sql: " + sql + " (" + e.getMessage() + ")");
        }
    }

    /**
     * Runs a query and adds the non-null values of one string column to a collection.
     *
     * @param context open DSpace context
     * @param sql     query to run
     * @param column  name of the string column to read from each row
     * @param target  collection that receives the values
     * @throws SQLException if the query or the iteration fails
     */
    private static void loadColumn(Context context, String sql, String column,
            Collection<String> target) throws SQLException
    {
        TableRowIterator iterator = DatabaseManager.query(context, sql);
        try
        {
            while (iterator.hasNext())
            {
                String value = iterator.next().getStringColumn(column);
                // A null entry can never match a log field, and would make
                // String.indexOf throw when used as a compare string.
                if (value != null)
                {
                    target.add(value);
                }
            }
        }
        finally
        {
            iterator.close();
        }
    }

    /**
     * Loads the distinct IPs of <code>stats.ip_spider</code> into {@link #ipSpider}.
     *
     * @param context open DSpace context
     * @throws SQLException if the query fails
     */
    private static void getIpSpiders(Context context) throws SQLException
    {
        loadColumn(context, "SELECT distinct ip FROM stats.ip_spider ;", "ip", ipSpider);
    }

    /**
     * @param ip client IP of a log entry
     * @return <code>true</code> if the IP was loaded from <code>stats.ip_spider</code>
     */
    private static boolean isSpider(String ip)
    {
        return ipSpider.contains(ip);
    }

    /**
     * Loads the names of <code>stats.agent</code> into {@link #agents}.
     *
     * @param context open DSpace context
     * @throws SQLException if the query fails
     */
    private static void getAgents(Context context) throws SQLException
    {
        loadColumn(context, "SELECT * FROM stats.agent ;", "name", agents);
    }

    /**
     * @param agent user-agent string of a log entry
     * @return <code>true</code> if the exact string was loaded from <code>stats.agent</code>
     */
    private static boolean isAgent(String agent)
    {
        return agents.contains(agent);
    }

    /**
     * Loads the names of <code>stats.temp</code>, ordered by name, into
     * {@link #compareStrings}.
     *
     * @param context open DSpace context
     * @throws SQLException if the query fails
     */
    private static void getCompareStrings(Context context) throws SQLException
    {
        loadColumn(context, "SELECT * FROM stats.temp order by name;", "name", compareStrings);
    }

    /**
     * Builds the supported command-line options: <code>-p/--path</code>
     * (one argument), <code>-v/--verbose</code> and <code>-h/--help</code>.
     *
     * @return the populated options object
     */
    private static Options setCommandLineOptions()
    {
        // create an options object and populate it
        Options options = new Options();

        // OptionBuilder accumulates settings statically until create() is called.
        OptionBuilder.withLongOpt("path");
        OptionBuilder.withDescription(
                "Run the detection spiders for the specified weblog file.\n" +
                "Must be in combined format");
        Option path = OptionBuilder.create('p');
        path.setArgs(1);
        options.addOption(path);

        options.addOption("v", "verbose", false, "print detection logging to STDOUT");
        options.addOption("h", "help", false, "help");
        return options;
    }

    /**
     * Parses the command line and stores the result in {@link #isVerbose}
     * and {@link #pathToWeblog}. Prints usage and terminates the JVM when
     * parsing fails (exit code 1), when help is requested (exit code 0) or
     * when no path is given (exit code 0).
     *
     * @param argv raw command-line arguments
     */
    private static void readCommandLineOptions(String[] argv)
    {
        // set up command line parser
        CommandLineParser parser = new PosixParser();
        CommandLine line = null;
        Options options = setCommandLineOptions();
        try
        {
            line = parser.parse(options, argv);
        }
        catch (MissingArgumentException e)
        {
            System.out.println("Missing Argument: " + e.getMessage());
            new HelpFormatter().printHelp("SpiderDetector\n", options);
            System.exit(1);
        }
        catch (ParseException e)
        {
            System.out.println("ERROR: " + e.getMessage());
            new HelpFormatter().printHelp("SpiderDetector\n", options);
            System.exit(1);
        }

        if (line.hasOption('h'))
        {
            new HelpFormatter().printHelp("SpiderDetector\n", options);
            System.exit(0);
        }

        if (line.hasOption('v'))
        {
            isVerbose = true;
        }

        if (!line.hasOption('p'))
        {
            System.out.println("You have to specify a path to the weblog file\n");
            new HelpFormatter().printHelp("SpiderDetector\n", options);
            // NOTE(review): exits with 0 although this is an error; kept
            // because calling scripts may rely on it.
            System.exit(0);
        }

        // Only reached when -p is present.
        pathToWeblog = line.getOptionValue('p');
    }
}