/*
 * LogAnalyser.java
 *
 * Version: $Revision: 4735 $
 *
 * Date: $Date: 2010-02-01 23:11:43 +0000 (Mon, 01 Feb 2010) $
 *
 * Copyright (c) 2002-2009, The DSpace Foundation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the DSpace Foundation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */
package org.dspace.app.statistics;

import org.dspace.core.ConfigurationManager;
import org.dspace.core.Context;
import org.dspace.core.LogManager;
import org.dspace.storage.rdbms.DatabaseManager;
import org.dspace.storage.rdbms.TableRow;

import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * This class performs all the actual analysis of a given set of DSpace log
 * files.  Most input can be configured; use the -help flag for a full list
 * of usage information.
 *
 * The output of this class is plain text and forms an "aggregation" file
 * which can then be used for display purposes by the related
 * ReportGenerator class.
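 *
 * For illustration, a typical command-line invocation looks something like
 * the following (the launcher script and paths are examples only and will
 * vary by installation):
 *
 * <pre>
 *   [dspace]/bin/dsrun org.dspace.app.statistics.LogAnalyser \
 *       -log [dspace]/log -start 2010-01-01 -end 2010-01-31 -lookup
 * </pre>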
 *
 * @author Richard Jones
 */
public class LogAnalyser
{
    // set up our class globals
    // FIXME: there are so many of these perhaps they should exist in a static
    // object of their own

    /////////////////
    // aggregators
    /////////////////

    /** aggregator for all actions performed in the system */
    private static Map actionAggregator;

    /** aggregator for all searches performed */
    private static Map searchAggregator;

    /** aggregator for user logins */
    private static Map userAggregator;

    /** aggregator for item views */
    private static Map itemAggregator;

    /** aggregator for current archive state statistics */
    private static Map archiveStats;

    /** warning counter */
    private static int warnCount = 0;

    /** log line counter */
    private static int lineCount = 0;

    //////////////////
    // config data
    //////////////////

    /** list of actions to be included in the general summary */
    private static List generalSummary;

    /** list of words not to be aggregated */
    private static List excludeWords;

    /** list of search types to be ignored, such as "author:" */
    private static List excludeTypes;

    /** list of characters to be excluded */
    private static List excludeChars;

    /** list of item types to be reported on in the current state */
    private static List itemTypes;

    /** bottom limit to output for search word analysis */
    private static int searchFloor;

    /** bottom limit to output for item view analysis */
    private static int itemFloor;

    /** number of items from most popular to be looked up in the database */
    private static int itemLookup;

    /** mode to use for user email display */
    private static String userEmail;

    /** URL of the service being analysed */
    private static String url;

    /** name of the service being analysed */
    private static String name;

    /** host name of the service being analysed */
    private static String hostName;

    /** the average number of views per item */
    private static int views = 0;

    ///////////////////////
    // regular expressions
    ///////////////////////

    /** exclude characters regular expression pattern */
    private static Pattern excludeCharRX = null;

    /** handle indicator string regular expression pattern */
    private static Pattern handleRX = null;

    /** item id indicator string regular expression pattern */
    private static Pattern itemRX = null;

    /** query string indicator regular expression pattern */
    private static Pattern queryRX = null;

    /** collection indicator regular expression pattern */
    private static Pattern collectionRX = null;

    /** community indicator regular expression pattern */
    private static Pattern communityRX = null;

    /** results indicator regular expression pattern */
    private static Pattern resultsRX = null;

    /** single character regular expression pattern */
    private static Pattern singleRX = null;

    /** a pattern to match a valid version 1.3 log file line */
    private static Pattern valid13 = null;

    /** a pattern to match a valid version 1.4 log file line */
    private static Pattern valid14 = null;

    /** pattern to match valid log file names */
    private static Pattern logRegex = null;

    /** pattern to match commented-out lines in the config file */
    private static Pattern comment = Pattern.compile("^#");

    /** pattern to match genuine lines in the config file */
    private static Pattern real = Pattern.compile("^(.+)=(.+)");

    /** pattern to match all search types */
    private static Pattern typeRX = null;

    /** pattern to match all words to be excluded */
    private static Pattern wordRX = null;

    //////////////////////////
    // Miscellaneous variables
    //////////////////////////

    /** process timing clock */
    private static Calendar startTime = null;
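
    // NOTE (illustrative, not in the original source): each aggregator above
    // maps a String key to an Integer count, for example:
    //
    //   searchAggregator.get("dspace")     -> number of searches for "dspace"
    //   itemAggregator.get("123456789/1")  -> number of views of that handle
    //
    // With generics these would read Map<String, Integer>; the raw types are
    // kept to match the style of the rest of this class.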

    /////////////////////////
    // command line options
    /////////////////////////

    /** the log directory to be analysed */
    private static String logDir = ConfigurationManager.getProperty("log.dir");

    /** the regex to describe the file name format */
    private static String fileTemplate = "dspace\\.log.*";

    /** the config file from which to configure the analyser */
    public static String configFile = ConfigurationManager.getProperty("dspace.dir") +
                            File.separator + "config" + File.separator +
                            "dstat.cfg";

    /** the output file to which to write aggregation data */
    private static String outFile = ConfigurationManager.getProperty("log.dir") +
                            File.separator + "dstat.dat";

    /** the starting date of the report */
    private static Date startDate = null;

    /** the end date of the report */
    private static Date endDate = null;

    /** the starting date of the report as obtained from the log files */
    private static Date logStartDate = null;

    /** the end date of the report as obtained from the log files */
    private static Date logEndDate = null;

    /** are we looking stuff up in the database */
    private static boolean lookUp = false;

    /**
     * main method to be run from the command line.  See the usage information
     * for details on how to use the command line flags (-help)
     */
    public static void main(String[] argv)
        throws Exception, SQLException
    {
        // first, start the processing clock
        startTime = new GregorianCalendar();

        // create context as super user
        Context context = new Context();
        context.setIgnoreAuthorization(true);

        // set up our command line variables
        String myLogDir = null;
        String myFileTemplate = null;
        String myConfigFile = null;
        String myOutFile = null;
        Date myStartDate = null;
        Date myEndDate = null;
        boolean myLookUp = false;

        // read in our command line options
        for (int i = 0; i < argv.length; i++)
        {
            if (argv[i].equals("-log"))
            {
                myLogDir = argv[i + 1];
            }

            if (argv[i].equals("-file"))
            {
                myFileTemplate = argv[i + 1];
            }

            if (argv[i].equals("-cfg"))
            {
                myConfigFile = argv[i + 1];
            }

            if (argv[i].equals("-out"))
            {
                myOutFile = argv[i + 1];
            }

            if (argv[i].equals("-help"))
            {
                LogAnalyser.usage();
                System.exit(0);
            }

            if (argv[i].equals("-start"))
            {
                myStartDate = parseDate(argv[i + 1]);
            }

            if (argv[i].equals("-end"))
            {
                myEndDate = parseDate(argv[i + 1]);
            }

            if (argv[i].equals("-lookup"))
            {
                myLookUp = true;
            }
        }

        // now call the method which actually processes the logs
        processLogs(context, myLogDir, myFileTemplate, myConfigFile, myOutFile,
                    myStartDate, myEndDate, myLookUp);
    }
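
    // Illustrative programmatic use (not in the original source; the paths
    // and dates below are examples only):
    //
    //   Context ctx = new Context();
    //   ctx.setIgnoreAuthorization(true);
    //   LogAnalyser.processLogs(ctx, "/dspace/log", null, null,
    //           "/dspace/log/dstat.dat",
    //           LogAnalyser.parseDate("2010-01-01"),
    //           LogAnalyser.parseDate("2010-01-31"), false);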

    /**
     * using the pre-configuration information passed here, analyse the logs
     * and produce the aggregation file
     *
     * @param context        the DSpace context object this occurs under
     * @param myLogDir       the passed log directory.  Uses default if null
     * @param myFileTemplate the passed file name regex.  Uses default if null
     * @param myConfigFile   the DStat config file.  Uses default if null
     * @param myOutFile      the file to which to output aggregation data.
     *                       Uses default if null
     * @param myStartDate    the desired start of the analysis.  Starts from
     *                       the beginning otherwise
     * @param myEndDate      the desired end of the analysis.  Goes to the
     *                       end otherwise
     * @param myLookUp       force a lookup of the database
     */
    public static void processLogs(Context context, String myLogDir,
                                   String myFileTemplate, String myConfigFile,
                                   String myOutFile, Date myStartDate,
                                   Date myEndDate, boolean myLookUp)
        throws IOException, SQLException
    {
        // FIXME: perhaps we should have all parameters and aggregators put
        // together in a single aggregating object

        // start (or restart) the processing clock
        startTime = new GregorianCalendar();

        // instantiate aggregators
        actionAggregator = new HashMap();
        searchAggregator = new HashMap();
        userAggregator = new HashMap();
        itemAggregator = new HashMap();
        archiveStats = new HashMap();

        // instantiate lists
        generalSummary = new ArrayList();
        excludeWords = new ArrayList();
        excludeTypes = new ArrayList();
        excludeChars = new ArrayList();
        itemTypes = new ArrayList();

        // set the parameters for this analysis
        setParameters(myLogDir, myFileTemplate, myConfigFile, myOutFile,
                      myStartDate, myEndDate, myLookUp);

        // pre-prepare our standard file readers and buffered readers
        FileReader fr = null;
        BufferedReader br = null;

        // read in the config information, throwing an error if we fail to
        // open the given config file
        readConfig(configFile);

        // assemble the regular expressions for later use (requires the file
        // template to build the regex to match it)
        setRegex(fileTemplate);

        // get the log files
        File[] logFiles = getLogFiles(logDir);

        // standard loop counter
        int i = 0;

        // for every log file do analysis
        // FIXME: it is easy to implement not processing log files after the
        // dates exceed the end boundary, but is there an easy way to do it
        // for the start of the file?  Note that we can assume that the
        // contents of each log file are sequential, but can we assume the
        // files are provided in a date sequence?
        for (i = 0; i < logFiles.length; i++)
        {
            // check to see if this file is a log file against the global regex
            Matcher matchRegex = logRegex.matcher(logFiles[i].getName());
            if (matchRegex.matches())
            {
                // if it is a log file, open it up and let's have a look at
                // the contents.
                try
                {
                    fr = new FileReader(logFiles[i].toString());
                    br = new BufferedReader(fr);
                }
                catch (IOException e)
                {
                    System.out.println("Failed to read log file " +
                                       logFiles[i].toString());
                    System.exit(0);
                }

                // for each line in the file do the analysis
                // FIXME: perhaps each section needs to be doled out to an
                // analysing class to allow pluggability of other methods of
                // analysis, and ease of code reading too - pending further
                // thought
                String line = null;
                while ((line = br.readLine()) != null)
                {
                    // get the log line object
                    LogLine logLine = getLogLine(line);

                    // if there are line segments get on with the analysis
                    if (logLine != null)
                    {
                        // first find out if we are constraining by date and
                        // if so apply the restrictions
                        if ((startDate != null) && (!logLine.afterDate(startDate)))
                        {
                            continue;
                        }

                        if ((endDate != null) && (!logLine.beforeDate(endDate)))
                        {
                            break;
                        }

                        // count the number of lines parsed
                        lineCount++;

                        // if we are not constrained by date, register the date
                        // as the start/end date if it is the earliest/latest
                        // seen so far
                        // FIXME: this should probably have a method of its own
                        if (startDate == null)
                        {
                            if (logStartDate != null)
                            {
                                if (logLine.beforeDate(logStartDate))
                                {
                                    logStartDate = logLine.getDate();
                                }
                            }
                            else
                            {
                                logStartDate = logLine.getDate();
                            }
                        }

                        if (endDate == null)
                        {
                            if (logEndDate != null)
                            {
                                if (logLine.afterDate(logEndDate))
                                {
                                    logEndDate = logLine.getDate();
                                }
                            }
                            else
                            {
                                logEndDate = logLine.getDate();
                            }
                        }

                        // count the warnings
                        if (logLine.isLevel("WARN"))
                        {
                            // FIXME: really, this ought to be some kind of
                            // level aggregator
                            warnCount++;
                        }

                        // is the action a search?
                        if (logLine.isAction("search"))
                        {
                            // get back all the valid search words from the query
                            String[] words = analyseQuery(logLine.getParams());

                            // for each search word add to the aggregator or
                            // increment the aggregator's counter
                            for (int j = 0; j < words.length; j++)
                            {
                                // FIXME: perhaps aggregators ought to be
                                // objects themselves
                                searchAggregator.put(words[j],
                                    increment(searchAggregator, words[j]));
                            }
                        }

                        // is the action a login, and are we counting user logins?
                        if (logLine.isAction("login") && !userEmail.equals("off"))
                        {
                            userAggregator.put(logLine.getUser(),
                                increment(userAggregator, logLine.getUser()));
                        }

                        // is the action an item view?
                        if (logLine.isAction("view_item"))
                        {
                            String handle = logLine.getParams();

                            // strip the handle string
                            Matcher matchHandle = handleRX.matcher(handle);
                            handle = matchHandle.replaceAll("");

                            // strip the item id string
                            Matcher matchItem = itemRX.matcher(handle);
                            handle = matchItem.replaceAll("");

                            // trim the handle (note: String.trim() returns a
                            // new string; the original code discarded the
                            // trimmed result)
                            handle = handle.trim();

                            // either add the handle to the aggregator or
                            // increment its counter
                            itemAggregator.put(handle,
                                increment(itemAggregator, handle));
                        }

                        // log all the activity
                        actionAggregator.put(logLine.getAction(),
                            increment(actionAggregator, logLine.getAction()));
                    }
                }

                // close the file reading buffers
                br.close();
                fr.close();
            }
        }

        // do we want to do a database lookup?  Do so only if the start and
        // end dates are null or lookUp is true
        // FIXME: despite the comment above, the lookup below currently runs
        // unconditionally; the lookUp flag is set but never consulted
        // FIXME: this is a kind of separate section.  Would it be worth
        // building the summary string separately and then inserting it into
        // the real summary later?  Especially if we make the archive
        // analysis more complex
        archiveStats.put("All Items", getNumItems(context));
        for (i = 0; i < itemTypes.size(); i++)
        {
            archiveStats.put(itemTypes.get(i),
                getNumItems(context, (String) itemTypes.get(i)));
        }

        // now do the host name and url lookup
        hostName = ConfigurationManager.getProperty("dspace.hostname").trim();
        name = ConfigurationManager.getProperty("dspace.name").trim();
        url = ConfigurationManager.getProperty("dspace.url").trim();
        if ((url != null) && (!url.endsWith("/")))
        {
            url = url + "/";
        }

        // do the average views analysis
        if (((Integer) archiveStats.get("All Items")).intValue() != 0)
        {
            // FIXME: this is dependent on there being a query on the db,
            // which there might not always be if it becomes configurable
            // note: divide as doubles; with integer division the Math.ceil()
            // call was a no-op
            double avg = Math.ceil(
                ((Integer) actionAggregator.get("view_item")).doubleValue() /
                ((Integer) archiveStats.get("All Items")).doubleValue());
            views = (int) avg;
        }

        // finally, write the output
        createOutput();

        return;
    }

    /**
     * set the passed parameters up as global class variables.  This has to
     * be done in a separate method because the API permits running from
     * the command line with args or calling the processLogs method statically
     * from elsewhere
     *
     * @param myLogDir       the log file directory to be analysed
     * @param myFileTemplate regex for log file names
     * @param myConfigFile   config file to use for dstat
     * @param myOutFile      file to write the aggregation into
     * @param myStartDate    requested log reporting start date
     * @param myEndDate      requested log reporting end date
     * @param myLookUp       requested look up force flag
     */
    public static void setParameters(String myLogDir, String myFileTemplate,
                                     String myConfigFile, String myOutFile,
                                     Date myStartDate, Date myEndDate,
                                     boolean myLookUp)
    {
        if (myLogDir != null)
        {
            logDir = myLogDir;
        }

        if (myFileTemplate != null)
        {
            fileTemplate = myFileTemplate;
        }

        if (myConfigFile != null)
        {
            configFile = myConfigFile;
        }

        if (myStartDate != null)
        {
            startDate = myStartDate;
        }

        if (myEndDate != null)
        {
            endDate = myEndDate;
        }

        // note: the original code guarded this assignment with
        // (myLogDir != null), which looks like a copy-and-paste slip; the
        // flag is now always carried across
        lookUp = myLookUp;

        if (myOutFile != null)
        {
            outFile = myOutFile;
        }

        return;
    }
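
    // For illustration (not in the original source), the aggregation file
    // written by createOutput() below is a flat key=value listing; the
    // values here are invented:
    //
    //   log_lines=12345
    //   warnings=17
    //   general_summary=view_item
    //   server_name=dspace.example.org
    //   service_name=Example DSpace
    //   start_date=01/01/2010
    //   end_date=31/01/2010
    //   archive.All Items=1024
    //   action.view_item=9876
    //   user_email=off
    //   search_floor=5
    //   search.history=42
    //   item_floor=5
    //   host_url=http://dspace.example.org/
    //   item_lookup=10
    //   item.123456789/1=99
    //   avg_item_views=9
    //   analysis_process_time=12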

    /**
     * generate the analyser's output to the specified out file
     */
    public static void createOutput()
    {
        // start a string buffer to hold the final output
        StringBuffer summary = new StringBuffer();

        // define an iterator that will be used to go over the hashmap keys
        Iterator keys = null;

        // output the number of lines parsed
        summary.append("log_lines=" + Integer.toString(lineCount) + "\n");

        // output the number of warnings encountered
        summary.append("warnings=" + Integer.toString(warnCount) + "\n");

        // set the general summary config up in the aggregator file
        for (int i = 0; i < generalSummary.size(); i++)
        {
            summary.append("general_summary=" + generalSummary.get(i) + "\n");
        }

        // output the host name
        summary.append("server_name=" + hostName + "\n");

        // output the service name
        summary.append("service_name=" + name + "\n");

        // output the date information if necessary
        SimpleDateFormat sdf = new SimpleDateFormat("dd'/'MM'/'yyyy");

        if (startDate != null)
        {
            summary.append("start_date=" + sdf.format(startDate) + "\n");
        }
        else if (logStartDate != null)
        {
            summary.append("start_date=" + sdf.format(logStartDate) + "\n");
        }

        if (endDate != null)
        {
            summary.append("end_date=" + sdf.format(endDate) + "\n");
        }
        else if (logEndDate != null)
        {
            summary.append("end_date=" + sdf.format(logEndDate) + "\n");
        }

        // write out the archive stats
        keys = archiveStats.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            summary.append("archive." + key + "=" + archiveStats.get(key) + "\n");
        }

        // write out the action aggregation results
        keys = actionAggregator.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            summary.append("action." + key + "=" + actionAggregator.get(key) + "\n");
        }

        // depending on the config settings for reporting on emails, output
        // the login information
        summary.append("user_email=" + userEmail + "\n");
        int address = 1;
        keys = userAggregator.keySet().iterator();

        // for each email address either write out the address and the count
        // or alias it with an "Address X" label, to keep the data confidential
        // FIXME: the users reporting should also have a floor value
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            summary.append("user.");
            if (userEmail.equals("on"))
            {
                summary.append(key + "=" + userAggregator.get(key) + "\n");
            }
            else if (userEmail.equals("alias"))
            {
                summary.append("Address " + Integer.toString(address++) +
                               "=" + userAggregator.get(key) + "\n");
            }
        }

        // FIXME: all values which have floors set should provide an "other"
        // record which counts how many entries fell below the floor and so
        // did not make it into the listing

        // output the search word information
        summary.append("search_floor=" + searchFloor + "\n");
        keys = searchAggregator.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            if (((Integer) searchAggregator.get(key)).intValue() >= searchFloor)
            {
                summary.append("search." + key + "=" +
                               searchAggregator.get(key) + "\n");
            }
        }

        // FIXME: we should do a lot more with the search aggregator
        // Possible feature list:
        //  - constrain by collection/community perhaps?
        //  - we should consider building our own aggregator class which can
        //    be full of rich data.  Perhaps this and the Stats class should
        //    be the same thing.

        // item viewing information
        summary.append("item_floor=" + itemFloor + "\n");
        summary.append("host_url=" + url + "\n");
        summary.append("item_lookup=" + itemLookup + "\n");

        // write out the item access information
        keys = itemAggregator.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            if (((Integer) itemAggregator.get(key)).intValue() >= itemFloor)
            {
                summary.append("item." + key + "=" +
                               itemAggregator.get(key) + "\n");
            }
        }

        // output the average views per item
        if (views > 0)
        {
            summary.append("avg_item_views=" + views + "\n");
        }

        // insert the analysis processing time information
        Calendar endTime = new GregorianCalendar();
        long timeInMillis = (endTime.getTimeInMillis() - startTime.getTimeInMillis());
        summary.append("analysis_process_time=" +
                       Long.toString(timeInMillis / 1000) + "\n");

        // finally write the string into the output file
        try
        {
            BufferedWriter out = new BufferedWriter(new FileWriter(outFile));
            out.write(summary.toString());
            out.flush();
            out.close();
        }
        catch (IOException e)
        {
            System.out.println("Unable to write to output file " + outFile);
            System.exit(0);
        }

        return;
    }

    /**
     * get an array of file objects representing the passed log directory
     *
     * @param logDir the log directory in which to pick up files
     *
     * @return an array of file objects representing the given logDir
     */
    public static File[] getLogFiles(String logDir)
    {
        // open the log files directory, read in the files, check that they
        // match the passed regular expression then analyse the content
        File logs = new File(logDir);

        // if log dir is not a directory throw an error and exit
        if (!logs.isDirectory())
        {
            System.out.println("Passed log directory is not a directory");
            System.exit(0);
        }

        // get the files in the directory
        return logs.listFiles();
    }

    /**
     * set up the regular expressions to be used by this analyser.  Mostly this
     * exists to provide a degree of segregation and readability to the code
     * and to ensure that the regular expressions only need to be set up once
     *
     * @param fileTemplate the regex to be used to identify dspace log files
     */
    public static void setRegex(String fileTemplate)
    {
        // build the exclude characters regular expression
        StringBuffer charRegEx = new StringBuffer();
        charRegEx.append("[");
        for (int i = 0; i < excludeChars.size(); i++)
        {
            charRegEx.append("\\" + (String) excludeChars.get(i));
        }
        charRegEx.append("]");
        excludeCharRX = Pattern.compile(charRegEx.toString());

        // regular expression to find handle indicators in strings
        handleRX = Pattern.compile("handle=");

        // regular expression to find item_id indicators in strings
        itemRX = Pattern.compile(",item_id=.*$");

        // regular expression to find query indicators in strings
        queryRX = Pattern.compile("query=");

        // regular expression to find collections in strings
        collectionRX = Pattern.compile("collection_id=[0-9]*,");

        // regular expression to find communities in strings
        communityRX = Pattern.compile("community_id=[0-9]*,");

        // regular expression to find search result sets
        resultsRX = Pattern.compile(",results=(.*)");

        // regular expression to find single characters anywhere in the string
        singleRX = Pattern.compile("( . |^. | .$)");

        // set up the standard log file line regular expressions
        String logLine13 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:([^:]+):(.*)";
        String logLine14 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:[^:]+:([^:]+):(.*)";
        valid13 = Pattern.compile(logLine13);
        valid14 = Pattern.compile(logLine14);

        // set up the pattern for validating log file names
        logRegex = Pattern.compile(fileTemplate);

        // set up the pattern for matching any of the query types
        StringBuffer typeRXString = new StringBuffer();
        typeRXString.append("(");
        for (int i = 0; i < excludeTypes.size(); i++)
        {
            if (i > 0)
            {
                typeRXString.append("|");
            }
            typeRXString.append((String) excludeTypes.get(i));
        }
        typeRXString.append(")");
        typeRX = Pattern.compile(typeRXString.toString());

        // set up the pattern for matching any of the words to exclude
        StringBuffer wordRXString = new StringBuffer();
        wordRXString.append("(");
        for (int i = 0; i < excludeWords.size(); i++)
        {
            if (i > 0)
            {
                wordRXString.append("|");
            }
            wordRXString.append(" " + (String) excludeWords.get(i) + " ");
            wordRXString.append("|");
            wordRXString.append("^" + (String) excludeWords.get(i) + " ");
            wordRXString.append("|");
            wordRXString.append(" " + (String) excludeWords.get(i) + "$");
        }
        wordRXString.append(")");
        wordRX = Pattern.compile(wordRXString.toString());

        return;
    }

    /**
     * read in the given config file and populate the class globals
     *
     * @param configFile the config file to read in
     */
    public static void readConfig(String configFile)
        throws IOException
    {
        // instantiate aggregators
        actionAggregator = new HashMap();
        searchAggregator = new HashMap();
        userAggregator = new HashMap();
        itemAggregator = new HashMap();
        archiveStats = new HashMap();

        // instantiate lists
        generalSummary = new ArrayList();
        excludeWords = new ArrayList();
        excludeTypes = new ArrayList();
        excludeChars = new ArrayList();
        itemTypes = new ArrayList();

        // prepare our standard file readers and buffered readers
        FileReader fr = null;
        BufferedReader br = null;

        String record = null;
        try
        {
            fr = new FileReader(configFile);
            br = new BufferedReader(fr);
        }
        catch (IOException e)
        {
            System.out.println("Failed to read config file: " + configFile);
            System.exit(0);
        }

        // read in the config file and set up our instance variables
        while ((record = br.readLine()) != null)
        {
            // check to see what kind of line we have
            Matcher matchComment = comment.matcher(record);
            Matcher matchReal = real.matcher(record);

            // if the line is not a comment and is real, read it in
            // (note: find() rather than the original matches(), so that any
            // line starting with '#' is treated as a comment, not just the
            // bare "#" line)
            if (!matchComment.find() && matchReal.matches())
            {
                // lift the values out of the matcher's result groups
                String key = matchReal.group(1).trim();
                String value = matchReal.group(2).trim();

                // read the config values into our instance variables (see
                // documentation for more info on config params)
                if (key.equals("general.summary"))
                {
                    actionAggregator.put(value, new Integer(0));
                    generalSummary.add(value);
                }

                if (key.equals("exclude.word"))
                {
                    excludeWords.add(value);
                }

                if (key.equals("exclude.type"))
                {
                    excludeTypes.add(value);
                }

                if (key.equals("exclude.character"))
                {
                    excludeChars.add(value);
                }

                if (key.equals("item.type"))
                {
                    itemTypes.add(value);
                }

                if (key.equals("item.floor"))
                {
                    itemFloor = Integer.parseInt(value);
                }

                if (key.equals("search.floor"))
                {
                    searchFloor = Integer.parseInt(value);
                }

                if (key.equals("item.lookup"))
                {
                    itemLookup = Integer.parseInt(value);
                }

                if (key.equals("user.email"))
                {
                    userEmail = value;
                }
            }
        }

        // close the inputs
        br.close();
        fr.close();

        return;
    }

    /**
     * increment the value of the given map at the given key by one.
     *
     * @param map the map whose value we want to increase
     * @param key the key of the map whose value to increase
     *
     * @return an Integer object containing the new value
     */
    public static Integer increment(Map map, String key)
    {
        Integer newValue = null;
        if (map.containsKey(key))
        {
            // FIXME: this seems like a ridiculous way to add Integers
            newValue = new Integer(((Integer) map.get(key)).intValue() + 1);
        }
        else
        {
            newValue = new Integer(1);
        }
        return newValue;
    }

    /**
     * Take the standard date string requested at the command line and convert
     * it into a Date object.  Throws an error and exits if the date does
     * not parse
     *
     * @param date the string representation of the date
     *
     * @return a Date object containing the date, with the time set to
     *         00:00:00
     */
    public static Date parseDate(String date)
    {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy'-'MM'-'dd");
        Date parsedDate = null;

        try
        {
            parsedDate = sdf.parse(date);
        }
        catch (ParseException e)
        {
            System.out.println("The date is not in the correct format");
            System.exit(0);
        }

        return parsedDate;
    }

    /**
     * Take the date object and convert it into a string of the form YYYY-MM-DD
     *
     * @param date the date to be converted
     *
     * @return a string of the form YYYY-MM-DD
     */
    public static String unParseDate(Date date)
    {
        // Use SimpleDateFormat
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy'-'MM'-'dd");
        return sdf.format(date);
    }

    /**
     * Take a search query string and pull out all of the meaningful
     * information from it, giving the results in the form of a String array,
     * one word per element
     *
     * @param query the search query to be analysed
     *
     * @return the string array containing meaningful search terms
     */
    public static String[] analyseQuery(String query)
    {
        // register our standard loop counter
        int i = 0;

        // make the query string totally lower case, to ensure we don't miss
        // out on matches due to capitalisation
        query = query.toLowerCase();

        // now perform successive find and replace operations using
        // pre-defined global regular expressions
        Matcher matchQuery = queryRX.matcher(query);
        query = matchQuery.replaceAll(" ");

        Matcher matchCollection = collectionRX.matcher(query);
        query = matchCollection.replaceAll(" ");

        Matcher matchCommunity = communityRX.matcher(query);
        query = matchCommunity.replaceAll(" ");

        Matcher matchResults = resultsRX.matcher(query);
        query = matchResults.replaceAll(" ");

        Matcher matchTypes = typeRX.matcher(query);
        query = matchTypes.replaceAll(" ");

        Matcher matchChars = excludeCharRX.matcher(query);
        query = matchChars.replaceAll(" ");

        Matcher matchWords = wordRX.matcher(query);
        query = matchWords.replaceAll(" ");

        Matcher single = singleRX.matcher(query);
        query = single.replaceAll(" ");

        // split the remaining string by whitespace, trim and stuff into an
        // array to be returned
        StringTokenizer st = new StringTokenizer(query);
        String[] words = new String[st.countTokens()];
        for (i = 0; i < words.length; i++)
        {
            words[i] = st.nextToken().trim();
        }

        // FIXME: some single characters are still slipping through the net;
        // why?  and how do we fix it?
        return words;
    }
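
    // Illustrative example (not in the original source; the input below is
    // invented): given a raw parameter string such as
    //
    //   "query=cats and dogs,results=42"
    //
    // analyseQuery() above strips the query= prefix, the results clause,
    // any excluded types, words and characters, and stray single characters,
    // leaving roughly { "cats", "dogs" } (assuming "and" is configured as an
    // exclude word in dstat.cfg).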

    /**
     * split the given line into its relevant segments if applicable (i.e.
     * the line matches the required regular expression)
     *
     * @param line the line to be segmented
     *
     * @return a LogLine object for the given line
     */
    public static LogLine getLogLine(String line)
    {
        // FIXME: consider moving this code into the LogLine class.  To do
        // this we need to define the structure and behaviour of the LogLine
        // class much more carefully
        Matcher match;

        if (line.indexOf(":ip_addr") > 0)
        {
            match = valid14.matcher(line);
        }
        else
        {
            match = valid13.matcher(line);
        }

        if (match.matches())
        {
            // set up a new log line object
            LogLine logLine = new LogLine(parseDate(match.group(1).trim()),
                LogManager.unescapeLogField(match.group(2)).trim(),
                LogManager.unescapeLogField(match.group(3)).trim(),
                LogManager.unescapeLogField(match.group(4)).trim(),
                LogManager.unescapeLogField(match.group(5)).trim());

            return logLine;
        }
        else
        {
            return null;
        }
    }
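
    // Illustrative log line shapes (assumed from the valid13/valid14
    // patterns above; the values are invented).  A 1.3-era line:
    //
    //   2005-03-01 12:34:56,789 INFO  org.dspace.app.Xyz @ user@example.com:sess123:view_item:handle=123456789/1
    //
    // and a 1.4-era line, which carries an extra ip_addr field and is
    // detected by the ":ip_addr" test in getLogLine():
    //
    //   2010-01-15 12:34:56,789 INFO  org.dspace.app.Xyz @ user@example.com:session_id=ABC123:ip_addr=10.0.0.1:view_item:handle=123456789/1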

    /**
     * get the number of items in the archive which were accessioned between
     * the provided start and end dates, with the given value for the DC field
     * 'type' (unqualified)
     *
     * @param context the DSpace context for the action
     * @param type    value for DC field 'type' (unqualified)
     *
     * @return an Integer containing the relevant count
     */
    public static Integer getNumItems(Context context, String type)
        throws SQLException
    {
        boolean oracle = false;
        if ("oracle".equals(ConfigurationManager.getProperty("db.name")))
        {
            oracle = true;
        }

        // FIXME: this method is clearly not optimised
        // FIXME: we don't yet collect total statistics, such as the number
        // of items withdrawn, the number in process of submission, etc.  We
        // should probably do that

        // start the type constraint
        String typeQuery = null;

        if (type != null)
        {
            typeQuery = "SELECT item_id " +
                        "FROM metadatavalue " +
                        "WHERE text_value LIKE '%" + type + "%' " +
                        "AND metadata_field_id = (" +
                        " SELECT metadata_field_id " +
                        " FROM metadatafieldregistry " +
                        " WHERE element = 'type' " +
                        " AND qualifier IS NULL) ";
        }

        // start the date constraint query buffer
        StringBuffer dateQuery = new StringBuffer();
        if (oracle)
        {
            dateQuery.append("SELECT /*+ ORDERED_PREDICATES */ item_id ");
        }
        else
        {
            dateQuery.append("SELECT item_id ");
        }

        dateQuery.append("FROM metadatavalue " +
                         "WHERE metadata_field_id = (" +
                         " SELECT metadata_field_id " +
                         " FROM metadatafieldregistry " +
                         " WHERE element = 'date' " +
                         " AND qualifier = 'accessioned') ");

        if (startDate != null)
        {
            if (oracle)
            {
                dateQuery.append(" AND TO_TIMESTAMP( TO_CHAR(text_value), " +
                        "'yyyy-mm-dd\"T\"hh24:mi:ss\"Z\"' ) > TO_DATE('" +
                        unParseDate(startDate) + "', 'yyyy-MM-dd') ");
            }
            else
            {
                dateQuery.append(" AND text_value::timestamp > '" +
                        unParseDate(startDate) + "'::timestamp ");
            }
        }

        if (endDate != null)
        {
            if (oracle)
            {
                dateQuery.append(" AND TO_TIMESTAMP( TO_CHAR(text_value), " +
                        "'yyyy-mm-dd\"T\"hh24:mi:ss\"Z\"' ) < TO_DATE('" +
                        unParseDate(endDate) + "', 'yyyy-MM-dd') ");
            }
            else
            {
                dateQuery.append(" AND text_value::timestamp < '" +
                        unParseDate(endDate) + "'::timestamp ");
            }
        }

        // build the final query
        StringBuffer query = new StringBuffer();

        query.append("SELECT COUNT(*) AS num " +
                     "FROM item " +
                     "WHERE in_archive = " + (oracle ? "1 " : "true ") +
                     "AND withdrawn = " + (oracle ? "0 " : "false "));

        if (startDate != null || endDate != null)
        {
            query.append(" AND item_id IN ( " + dateQuery.toString() + ") ");
        }

        if (type != null)
        {
            query.append(" AND item_id IN ( " + typeQuery + ") ");
        }

        TableRow row = DatabaseManager.querySingle(context, query.toString());

        Integer numItems;
        if (oracle)
        {
            numItems = new Integer(row.getIntColumn("num"));
        }
        else
        {
            // for some reason the number column is of "long" data type!
            Long count = new Long(row.getLongColumn("num"));
            numItems = new Integer(count.intValue());
        }
        return numItems;
    }

    /**
     * get the total number of items in the archive at time of execution,
     * ignoring all other constraints
     *
     * @param context the DSpace context the action is being performed in
     *
     * @return an Integer containing the number of items in the archive
     */
    public static Integer getNumItems(Context context)
        throws SQLException
    {
        return getNumItems(context, null);
    }

    /**
     * print out the usage information for this class to the standard out
     */
    public static void usage()
    {
        String usage = "Usage Information:\n" +
                       "LogAnalyser [options [parameters]]\n" +
                       "-log [log directory]\n" +
                       "\tOptional\n" +
                       "\tSpecify a directory containing log files\n" +
                       "\tDefault uses [dspace.dir]/log from dspace.cfg\n" +
                       "-file [file name regex]\n" +
                       "\tOptional\n" +
                       "\tSpecify a regular expression as the file name template.\n" +
                       "\tCurrently this needs to be correctly escaped for Java string handling (FIXME)\n" +
                       "\tDefault uses dspace.log*\n" +
                       "-cfg [config file path]\n" +
                       "\tOptional\n" +
                       "\tSpecify a config file to be used\n" +
                       "\tDefault uses dstat.cfg in dspace config directory\n" +
                       "-out [output file path]\n" +
                       "\tOptional\n" +
                       "\tSpecify an output file to write results into\n" +
                       "\tDefault uses dstat.dat in dspace log directory\n" +
                       "-start [YYYY-MM-DD]\n" +
                       "\tOptional\n" +
                       "\tSpecify the start date of the analysis\n" +
                       "\tIf a start date is specified then no attempt to gather \n" +
                       "\tcurrent database statistics will be made unless -lookup is\n" +
                       "\talso passed\n" +
                       "\tDefault is to start from the earliest date records exist for\n" +
                       "-end [YYYY-MM-DD]\n" +
                       "\tOptional\n" +
                       "\tSpecify the end date of the analysis\n" +
                       "\tIf an end date is specified then no attempt to gather \n" +
                       "\tcurrent database statistics will be made unless -lookup is\n" +
                       "\talso passed\n" +
                       "\tDefault is to work up to the last date records exist for\n" +
                       "-lookup\n" +
                       "\tOptional\n" +
                       "\tForce a lookup of the current database statistics\n" +
                       "\tOnly needs to be used if date constraints are also in place\n" +
                       "-help\n" +
                       "\tdisplay this usage information\n";

        System.out.println(usage);
    }
}
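
// For illustration (not part of the original source): with type = "article"
// and a start date set, the PostgreSQL query assembled by getNumItems()
// above has roughly this shape (line breaks added for readability):
//
//   SELECT COUNT(*) AS num FROM item
//   WHERE in_archive = true AND withdrawn = false
//     AND item_id IN ( SELECT item_id FROM metadatavalue
//                      WHERE metadata_field_id = (... 'date'/'accessioned' ...)
//                      AND text_value::timestamp > '2010-01-01'::timestamp )
//     AND item_id IN ( SELECT item_id FROM metadatavalue
//                      WHERE text_value LIKE '%article%'
//                      AND metadata_field_id = (... 'type' ...) )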