/** * $Id: $ * $URL: $ * ************************************************************************* * Copyright (c) 2002-2009, DuraSpace. All rights reserved * Licensed under the DuraSpace Foundation License. * * A copy of the DuraSpace License has been included in this * distribution and is available at: http://scm.dspace.org/svn/repo/licenses/LICENSE.txt */ package org.dspace.statistics.util; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.Options; import org.apache.commons.cli.PosixParser; import java.io.*; import java.util.HashSet; /** * @author Mark Diggory (mdiggory at atmire.com) * @author kevinvandevelde at atmire.com * @author ben at atmire.com */ public class ApacheLogRobotsProcessor { /** * Creates a file containing spiders based on an apache logfile * by analyzing users of the robots.txt file * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { // create an options object and populate it CommandLineParser parser = new PosixParser(); Options options = new Options(); options.addOption("l", "logfile", true, "type: Input log file"); options.addOption("s", "spiderfile", true, "type: Spider ip file"); CommandLine line = parser.parse(options, args); String logFileLoc; String spiderIpPath; if (line.hasOption("l")) logFileLoc = line.getOptionValue("l"); else { System.out.println("We need our log file"); return; } if (line.hasOption("s")) spiderIpPath = line.getOptionValue("s"); else { System.out.println("We need a spider ip output file"); return; } File spiderIpFile = new File(spiderIpPath); //Get the ip's already added in our file HashSet<String> logSpiders = new HashSet<String>(); if (spiderIpFile.exists()) logSpiders = SpiderDetector.readIpAddresses(spiderIpFile); //First read in our log file line per line BufferedReader in = new BufferedReader(new FileReader(logFileLoc)); String logLine; while ((logLine = in.readLine()) != null) { //Currently only check if robot.txt is present in our line if (logLine.contains("robots.txt")) { //We got a robots.txt so we got a bot String ip = logLine.substring(0, logLine.indexOf("-")).trim(); //Only add single ip addresses once we got it in it is enough logSpiders.add(ip); } } in.close(); //Last but not least add the ips to our file BufferedWriter output = new BufferedWriter(new FileWriter(spiderIpFile)); //Second write the new ips for (String ip : logSpiders) { System.out.println("Adding new ip: " + ip); //Write each new ip on a seperate line output.write(ip + "\n"); } output.flush(); output.close(); } }