/*
 * LogAnalyser.java
 *
 * Version: $Revision: 4735 $
 *
 * Date: $Date: 2010-02-01 23:11:43 +0000 (Mon, 01 Feb 2010) $
 *
 * Copyright (c) 2002-2009, The DSpace Foundation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the DSpace Foundation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */
package org.dspace.app.statistics;

import org.dspace.core.ConfigurationManager;
import org.dspace.core.Context;
import org.dspace.core.LogManager;
import org.dspace.storage.rdbms.DatabaseManager;
import org.dspace.storage.rdbms.TableRow;

import java.sql.SQLException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * This class performs all the actual analysis of a given set of DSpace log
 * files.  Most input can be configured; use the -help flag for a full list
 * of usage information.
 *
 * The output of this class is plain text and forms an "aggregation" file
 * which can then be used for display purposes by the related
 * ReportGenerator class.
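 *
 * For illustration, a typical command-line invocation looks something like
 * the following (the launcher script and paths are examples only and will
 * vary by installation):
 *
 * <pre>
 *   [dspace]/bin/dsrun org.dspace.app.statistics.LogAnalyser \
 *       -log [dspace]/log -start 2010-01-01 -end 2010-01-31 -lookup
 * </pre>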
 *
 * @author Richard Jones
 */
public class LogAnalyser
{
    // set up our class globals
    // FIXME: there are so many of these perhaps they should exist in a static
    // object of their own

    /////////////////
    // aggregators
    /////////////////

    /** aggregator for all actions performed in the system */
    private static Map actionAggregator;

    /** aggregator for all searches performed */
    private static Map searchAggregator;

    /** aggregator for user logins */
    private static Map userAggregator;

    /** aggregator for item views */
    private static Map itemAggregator;

    /** aggregator for current archive state statistics */
    private static Map archiveStats;

    /** warning counter */
    private static int warnCount = 0;

    /** log line counter */
    private static int lineCount = 0;

    //////////////////
    // config data
    //////////////////

    /** list of actions to be included in the general summary */
    private static List generalSummary;

    /** list of words not to be aggregated */
    private static List excludeWords;

    /** list of search types to be ignored, such as "author:" */
    private static List excludeTypes;

    /** list of characters to be excluded */
    private static List excludeChars;

    /** list of item types to be reported on in the current state */
    private static List itemTypes;

    /** bottom limit to output for search word analysis */
    private static int searchFloor;

    /** bottom limit to output for item view analysis */
    private static int itemFloor;

    /** number of items from most popular to be looked up in the database */
    private static int itemLookup;

    /** mode to use for user email display */
    private static String userEmail;

    /** URL of the service being analysed */
    private static String url;

    /** name of the service being analysed */
    private static String name;

    /** host name of the service being analysed */
    private static String hostName;

    /** the average number of views per item */
    private static int views = 0;

    ///////////////////////
    // regular expressions
    ///////////////////////

    /** exclude characters regular expression pattern */
    private static Pattern excludeCharRX = null;

    /** handle indicator string regular expression pattern */
    private static Pattern handleRX = null;

    /** item id indicator string regular expression pattern */
    private static Pattern itemRX = null;

    /** query string indicator regular expression pattern */
    private static Pattern queryRX = null;

    /** collection indicator regular expression pattern */
    private static Pattern collectionRX = null;

    /** community indicator regular expression pattern */
    private static Pattern communityRX = null;

    /** results indicator regular expression pattern */
    private static Pattern resultsRX = null;

    /** single character regular expression pattern */
    private static Pattern singleRX = null;

    /** a pattern to match a valid version 1.3 log file line */
    private static Pattern valid13 = null;

    /** a pattern to match a valid version 1.4 log file line */
    private static Pattern valid14 = null;

    /** pattern to match valid log file names */
    private static Pattern logRegex = null;

    /** pattern to match commented-out lines in the config file */
    private static Pattern comment = Pattern.compile("^#");

    /** pattern to match genuine lines in the config file */
    private static Pattern real = Pattern.compile("^(.+)=(.+)");

    /** pattern to match all search types */
    private static Pattern typeRX = null;

    /** pattern to match all words to be excluded */
    private static Pattern wordRX = null;

    //////////////////////////
    // Miscellaneous variables
    //////////////////////////

    /** process timing clock */
    private static Calendar startTime = null;
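
    // NOTE (illustrative, not in the original source): each aggregator above
    // maps a String key to an Integer count, for example:
    //
    //   searchAggregator.get("dspace")     -> number of searches for "dspace"
    //   itemAggregator.get("123456789/1")  -> number of views of that handle
    //
    // With generics these would read Map<String, Integer>; the raw types are
    // kept to match the style of the rest of this class.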

    /////////////////////////
    // command line options
    /////////////////////////

    /** the log directory to be analysed */
    private static String logDir = ConfigurationManager.getProperty("log.dir");

    /** the regex to describe the file name format */
    private static String fileTemplate = "dspace\\.log.*";

    /** the config file from which to configure the analyser */
    public static String configFile = ConfigurationManager.getProperty("dspace.dir") +
                            File.separator + "config" + File.separator +
                            "dstat.cfg";

    /** the output file to which to write aggregation data */
    private static String outFile = ConfigurationManager.getProperty("log.dir") +
                            File.separator + "dstat.dat";

    /** the starting date of the report */
    private static Date startDate = null;

    /** the end date of the report */
    private static Date endDate = null;

    /** the starting date of the report as obtained from the log files */
    private static Date logStartDate = null;

    /** the end date of the report as obtained from the log files */
    private static Date logEndDate = null;

    /** are we looking stuff up in the database */
    private static boolean lookUp = false;

    /**
     * main method to be run from the command line.  See the usage information
     * for details on how to use the command line flags (-help)
     */
    public static void main(String[] argv)
        throws Exception, SQLException
    {
        // first, start the processing clock
        startTime = new GregorianCalendar();

        // create context as super user
        Context context = new Context();
        context.setIgnoreAuthorization(true);

        // set up our command line variables
        String myLogDir = null;
        String myFileTemplate = null;
        String myConfigFile = null;
        String myOutFile = null;
        Date myStartDate = null;
        Date myEndDate = null;
        boolean myLookUp = false;

        // read in our command line options
        for (int i = 0; i < argv.length; i++)
        {
            if (argv[i].equals("-log"))
            {
                myLogDir = argv[i + 1];
            }

            if (argv[i].equals("-file"))
            {
                myFileTemplate = argv[i + 1];
            }

            if (argv[i].equals("-cfg"))
            {
                myConfigFile = argv[i + 1];
            }

            if (argv[i].equals("-out"))
            {
                myOutFile = argv[i + 1];
            }

            if (argv[i].equals("-help"))
            {
                LogAnalyser.usage();
                System.exit(0);
            }

            if (argv[i].equals("-start"))
            {
                myStartDate = parseDate(argv[i + 1]);
            }

            if (argv[i].equals("-end"))
            {
                myEndDate = parseDate(argv[i + 1]);
            }

            if (argv[i].equals("-lookup"))
            {
                myLookUp = true;
            }
        }

        // now call the method which actually processes the logs
        processLogs(context, myLogDir, myFileTemplate, myConfigFile, myOutFile,
                    myStartDate, myEndDate, myLookUp);
    }
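
    // Illustrative programmatic use (not in the original source; the paths
    // and dates below are examples only):
    //
    //   Context ctx = new Context();
    //   ctx.setIgnoreAuthorization(true);
    //   LogAnalyser.processLogs(ctx, "/dspace/log", null, null,
    //           "/dspace/log/dstat.dat",
    //           LogAnalyser.parseDate("2010-01-01"),
    //           LogAnalyser.parseDate("2010-01-31"), false);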

    /**
     * using the pre-configuration information passed here, analyse the logs
     * and produce the aggregation file
     *
     * @param context        the DSpace context object this occurs under
     * @param myLogDir       the passed log directory.  Uses default if null
     * @param myFileTemplate the passed file name regex.  Uses default if null
     * @param myConfigFile   the DStat config file.  Uses default if null
     * @param myOutFile      the file to which to output aggregation data.
     *                       Uses default if null
     * @param myStartDate    the desired start of the analysis.  Starts from
     *                       the beginning otherwise
     * @param myEndDate      the desired end of the analysis.  Goes to the
     *                       end otherwise
     * @param myLookUp       force a lookup of the database
     */
    public static void processLogs(Context context, String myLogDir,
                                   String myFileTemplate, String myConfigFile,
                                   String myOutFile, Date myStartDate,
                                   Date myEndDate, boolean myLookUp)
        throws IOException, SQLException
    {
        // FIXME: perhaps we should have all parameters and aggregators put
        // together in a single aggregating object

        // start (or restart) the processing clock
        startTime = new GregorianCalendar();

        // instantiate aggregators
        actionAggregator = new HashMap();
        searchAggregator = new HashMap();
        userAggregator = new HashMap();
        itemAggregator = new HashMap();
        archiveStats = new HashMap();

        // instantiate lists
        generalSummary = new ArrayList();
        excludeWords = new ArrayList();
        excludeTypes = new ArrayList();
        excludeChars = new ArrayList();
        itemTypes = new ArrayList();

        // set the parameters for this analysis
        setParameters(myLogDir, myFileTemplate, myConfigFile, myOutFile,
                      myStartDate, myEndDate, myLookUp);

        // pre-prepare our standard file readers and buffered readers
        FileReader fr = null;
        BufferedReader br = null;

        // read in the config information, throwing an error if we fail to
        // open the given config file
        readConfig(configFile);

        // assemble the regular expressions for later use (requires the file
        // template to build the regex to match it)
        setRegex(fileTemplate);

        // get the log files
        File[] logFiles = getLogFiles(logDir);

        // standard loop counter
        int i = 0;

        // for every log file do analysis
        // FIXME: it is easy to implement not processing log files after the
        // dates exceed the end boundary, but is there an easy way to do it
        // for the start of the file?  Note that we can assume that the
        // contents of each log file are sequential, but can we assume the
        // files are provided in a date sequence?
        for (i = 0; i < logFiles.length; i++)
        {
            // check to see if this file is a log file against the global regex
            Matcher matchRegex = logRegex.matcher(logFiles[i].getName());
            if (matchRegex.matches())
            {
                // if it is a log file, open it up and let's have a look at
                // the contents.
                try
                {
                    fr = new FileReader(logFiles[i].toString());
                    br = new BufferedReader(fr);
                }
                catch (IOException e)
                {
                    System.out.println("Failed to read log file " +
                                       logFiles[i].toString());
                    System.exit(0);
                }

                // for each line in the file do the analysis
                // FIXME: perhaps each section needs to be doled out to an
                // analysing class to allow pluggability of other methods of
                // analysis, and ease of code reading too - pending further
                // thought
                String line = null;
                while ((line = br.readLine()) != null)
                {
                    // get the log line object
                    LogLine logLine = getLogLine(line);

                    // if there are line segments get on with the analysis
                    if (logLine != null)
                    {
                        // first find out if we are constraining by date and
                        // if so apply the restrictions
                        if ((startDate != null) && (!logLine.afterDate(startDate)))
                        {
                            continue;
                        }

                        if ((endDate != null) && (!logLine.beforeDate(endDate)))
                        {
                            break;
                        }

                        // count the number of lines parsed
                        lineCount++;

                        // if we are not constrained by date, register the date
                        // as the start/end date if it is the earliest/latest
                        // seen so far
                        // FIXME: this should probably have a method of its own
                        if (startDate == null)
                        {
                            if (logStartDate != null)
                            {
                                if (logLine.beforeDate(logStartDate))
                                {
                                    logStartDate = logLine.getDate();
                                }
                            }
                            else
                            {
                                logStartDate = logLine.getDate();
                            }
                        }

                        if (endDate == null)
                        {
                            if (logEndDate != null)
                            {
                                if (logLine.afterDate(logEndDate))
                                {
                                    logEndDate = logLine.getDate();
                                }
                            }
                            else
                            {
                                logEndDate = logLine.getDate();
                            }
                        }

                        // count the warnings
                        if (logLine.isLevel("WARN"))
                        {
                            // FIXME: really, this ought to be some kind of
                            // level aggregator
                            warnCount++;
                        }

                        // is the action a search?
                        if (logLine.isAction("search"))
                        {
                            // get back all the valid search words from the query
                            String[] words = analyseQuery(logLine.getParams());

                            // for each search word add to the aggregator or
                            // increment the aggregator's counter
                            for (int j = 0; j < words.length; j++)
                            {
                                // FIXME: perhaps aggregators ought to be
                                // objects themselves
                                searchAggregator.put(words[j],
                                    increment(searchAggregator, words[j]));
                            }
                        }

                        // is the action a login, and are we counting user logins?
                        if (logLine.isAction("login") && !userEmail.equals("off"))
                        {
                            userAggregator.put(logLine.getUser(),
                                increment(userAggregator, logLine.getUser()));
                        }

                        // is the action an item view?
                        if (logLine.isAction("view_item"))
                        {
                            String handle = logLine.getParams();

                            // strip the handle string
                            Matcher matchHandle = handleRX.matcher(handle);
                            handle = matchHandle.replaceAll("");

                            // strip the item id string
                            Matcher matchItem = itemRX.matcher(handle);
                            handle = matchItem.replaceAll("");

                            // trim the handle (note: String.trim() returns a
                            // new string; the original code discarded the
                            // trimmed result)
                            handle = handle.trim();

                            // either add the handle to the aggregator or
                            // increment its counter
                            itemAggregator.put(handle,
                                increment(itemAggregator, handle));
                        }

                        // log all the activity
                        actionAggregator.put(logLine.getAction(),
                            increment(actionAggregator, logLine.getAction()));
                    }
                }

                // close the file reading buffers
                br.close();
                fr.close();
            }
        }

        // do we want to do a database lookup?  Do so only if the start and
        // end dates are null or lookUp is true
        // FIXME: despite the comment above, the lookup below currently runs
        // unconditionally; the lookUp flag is set but never consulted
        // FIXME: this is a kind of separate section.  Would it be worth
        // building the summary string separately and then inserting it into
        // the real summary later?  Especially if we make the archive
        // analysis more complex
        archiveStats.put("All Items", getNumItems(context));
        for (i = 0; i < itemTypes.size(); i++)
        {
            archiveStats.put(itemTypes.get(i),
                getNumItems(context, (String) itemTypes.get(i)));
        }

        // now do the host name and url lookup
        hostName = ConfigurationManager.getProperty("dspace.hostname").trim();
        name = ConfigurationManager.getProperty("dspace.name").trim();
        url = ConfigurationManager.getProperty("dspace.url").trim();
        if ((url != null) && (!url.endsWith("/")))
        {
            url = url + "/";
        }

        // do the average views analysis
        if (((Integer) archiveStats.get("All Items")).intValue() != 0)
        {
            // FIXME: this is dependent on there being a query on the db,
            // which there might not always be if it becomes configurable
            // note: divide as doubles; with integer division the Math.ceil()
            // call was a no-op
            double avg = Math.ceil(
                ((Integer) actionAggregator.get("view_item")).doubleValue() /
                ((Integer) archiveStats.get("All Items")).doubleValue());
            views = (int) avg;
        }

        // finally, write the output
        createOutput();

        return;
    }

    /**
     * set the passed parameters up as global class variables.  This has to
     * be done in a separate method because the API permits running from
     * the command line with args or calling the processLogs method statically
     * from elsewhere
     *
     * @param myLogDir       the log file directory to be analysed
     * @param myFileTemplate regex for log file names
     * @param myConfigFile   config file to use for dstat
     * @param myOutFile      file to write the aggregation into
     * @param myStartDate    requested log reporting start date
     * @param myEndDate      requested log reporting end date
     * @param myLookUp       requested look up force flag
     */
    public static void setParameters(String myLogDir, String myFileTemplate,
                                     String myConfigFile, String myOutFile,
                                     Date myStartDate, Date myEndDate,
                                     boolean myLookUp)
    {
        if (myLogDir != null)
        {
            logDir = myLogDir;
        }

        if (myFileTemplate != null)
        {
            fileTemplate = myFileTemplate;
        }

        if (myConfigFile != null)
        {
            configFile = myConfigFile;
        }

        if (myStartDate != null)
        {
            startDate = myStartDate;
        }

        if (myEndDate != null)
        {
            endDate = myEndDate;
        }

        // note: the original code guarded this assignment with
        // (myLogDir != null), which looks like a copy-and-paste slip; the
        // flag is now always carried across
        lookUp = myLookUp;

        if (myOutFile != null)
        {
            outFile = myOutFile;
        }

        return;
    }
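
    // For illustration (not in the original source), the aggregation file
    // written by createOutput() below is a flat key=value listing; the
    // values here are invented:
    //
    //   log_lines=12345
    //   warnings=17
    //   general_summary=view_item
    //   server_name=dspace.example.org
    //   service_name=Example DSpace
    //   start_date=01/01/2010
    //   end_date=31/01/2010
    //   archive.All Items=1024
    //   action.view_item=9876
    //   user_email=off
    //   search_floor=5
    //   search.history=42
    //   item_floor=5
    //   host_url=http://dspace.example.org/
    //   item_lookup=10
    //   item.123456789/1=99
    //   avg_item_views=9
    //   analysis_process_time=12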

    /**
     * generate the analyser's output to the specified out file
     */
    public static void createOutput()
    {
        // start a string buffer to hold the final output
        StringBuffer summary = new StringBuffer();

        // define an iterator that will be used to go over the hashmap keys
        Iterator keys = null;

        // output the number of lines parsed
        summary.append("log_lines=" + Integer.toString(lineCount) + "\n");

        // output the number of warnings encountered
        summary.append("warnings=" + Integer.toString(warnCount) + "\n");

        // set the general summary config up in the aggregator file
        for (int i = 0; i < generalSummary.size(); i++)
        {
            summary.append("general_summary=" + generalSummary.get(i) + "\n");
        }

        // output the host name
        summary.append("server_name=" + hostName + "\n");

        // output the service name
        summary.append("service_name=" + name + "\n");

        // output the date information if necessary
        SimpleDateFormat sdf = new SimpleDateFormat("dd'/'MM'/'yyyy");

        if (startDate != null)
        {
            summary.append("start_date=" + sdf.format(startDate) + "\n");
        }
        else if (logStartDate != null)
        {
            summary.append("start_date=" + sdf.format(logStartDate) + "\n");
        }

        if (endDate != null)
        {
            summary.append("end_date=" + sdf.format(endDate) + "\n");
        }
        else if (logEndDate != null)
        {
            summary.append("end_date=" + sdf.format(logEndDate) + "\n");
        }

        // write out the archive stats
        keys = archiveStats.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            summary.append("archive." + key + "=" + archiveStats.get(key) + "\n");
        }

        // write out the action aggregation results
        keys = actionAggregator.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            summary.append("action." + key + "=" + actionAggregator.get(key) + "\n");
        }

        // depending on the config settings for reporting on emails, output
        // the login information
        summary.append("user_email=" + userEmail + "\n");
        int address = 1;
        keys = userAggregator.keySet().iterator();

        // for each email address either write out the address and the count
        // or alias it with an "Address X" label, to keep the data confidential
        // FIXME: the users reporting should also have a floor value
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            summary.append("user.");
            if (userEmail.equals("on"))
            {
                summary.append(key + "=" + userAggregator.get(key) + "\n");
            }
            else if (userEmail.equals("alias"))
            {
                summary.append("Address " + Integer.toString(address++) +
                               "=" + userAggregator.get(key) + "\n");
            }
        }

        // FIXME: all values which have floors set should provide an "other"
        // record which counts how many entries fell below the floor and so
        // did not make it into the listing

        // output the search word information
        summary.append("search_floor=" + searchFloor + "\n");
        keys = searchAggregator.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            if (((Integer) searchAggregator.get(key)).intValue() >= searchFloor)
            {
                summary.append("search." + key + "=" +
                               searchAggregator.get(key) + "\n");
            }
        }

        // FIXME: we should do a lot more with the search aggregator
        // Possible feature list:
        //  - constrain by collection/community perhaps?
        //  - we should consider building our own aggregator class which can
        //    be full of rich data.  Perhaps this and the Stats class should
        //    be the same thing.

        // item viewing information
        summary.append("item_floor=" + itemFloor + "\n");
        summary.append("host_url=" + url + "\n");
        summary.append("item_lookup=" + itemLookup + "\n");

        // write out the item access information
        keys = itemAggregator.keySet().iterator();
        while (keys.hasNext())
        {
            String key = (String) keys.next();
            if (((Integer) itemAggregator.get(key)).intValue() >= itemFloor)
            {
                summary.append("item." + key + "=" +
                               itemAggregator.get(key) + "\n");
            }
        }

        // output the average views per item
        if (views > 0)
        {
            summary.append("avg_item_views=" + views + "\n");
        }

        // insert the analysis processing time information
        Calendar endTime = new GregorianCalendar();
        long timeInMillis = (endTime.getTimeInMillis() - startTime.getTimeInMillis());
        summary.append("analysis_process_time=" +
                       Long.toString(timeInMillis / 1000) + "\n");

        // finally write the string into the output file
        try
        {
            BufferedWriter out = new BufferedWriter(new FileWriter(outFile));
            out.write(summary.toString());
            out.flush();
            out.close();
        }
        catch (IOException e)
        {
            System.out.println("Unable to write to output file " + outFile);
            System.exit(0);
        }

        return;
    }

    /**
     * get an array of file objects representing the passed log directory
     *
     * @param logDir the log directory in which to pick up files
     *
     * @return an array of file objects representing the given logDir
     */
    public static File[] getLogFiles(String logDir)
    {
        // open the log files directory, read in the files, check that they
        // match the passed regular expression then analyse the content
        File logs = new File(logDir);

        // if log dir is not a directory throw an error and exit
        if (!logs.isDirectory())
        {
            System.out.println("Passed log directory is not a directory");
            System.exit(0);
        }

        // get the files in the directory
        return logs.listFiles();
    }

    /**
     * set up the regular expressions to be used by this analyser.  Mostly this
     * exists to provide a degree of segregation and readability to the code
     * and to ensure that the regular expressions only need to be set up once
     *
     * @param fileTemplate the regex to be used to identify dspace log files
     */
    public static void setRegex(String fileTemplate)
    {
        // build the exclude characters regular expression
        StringBuffer charRegEx = new StringBuffer();
        charRegEx.append("[");
        for (int i = 0; i < excludeChars.size(); i++)
        {
            charRegEx.append("\\" + (String) excludeChars.get(i));
        }
        charRegEx.append("]");
        excludeCharRX = Pattern.compile(charRegEx.toString());

        // regular expression to find handle indicators in strings
        handleRX = Pattern.compile("handle=");

        // regular expression to find item_id indicators in strings
        itemRX = Pattern.compile(",item_id=.*$");

        // regular expression to find query indicators in strings
        queryRX = Pattern.compile("query=");

        // regular expression to find collections in strings
        collectionRX = Pattern.compile("collection_id=[0-9]*,");

        // regular expression to find communities in strings
        communityRX = Pattern.compile("community_id=[0-9]*,");

        // regular expression to find search result sets
        resultsRX = Pattern.compile(",results=(.*)");

        // regular expression to find single characters anywhere in the string
        singleRX = Pattern.compile("( . |^. | .$)");

        // set up the standard log file line regular expressions
        String logLine13 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:([^:]+):(.*)";
        String logLine14 = "^(\\d\\d\\d\\d-\\d\\d\\-\\d\\d) \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d (\\w+)\\s+\\S+ @ ([^:]+):[^:]+:[^:]+:([^:]+):(.*)";
        valid13 = Pattern.compile(logLine13);
        valid14 = Pattern.compile(logLine14);

        // set up the pattern for validating log file names
        logRegex = Pattern.compile(fileTemplate);

        // set up the pattern for matching any of the query types
        StringBuffer typeRXString = new StringBuffer();
        typeRXString.append("(");
        for (int i = 0; i < excludeTypes.size(); i++)
        {
            if (i > 0)
            {
                typeRXString.append("|");
            }
            typeRXString.append((String) excludeTypes.get(i));
        }
        typeRXString.append(")");
        typeRX = Pattern.compile(typeRXString.toString());

        // set up the pattern for matching any of the words to exclude
        StringBuffer wordRXString = new StringBuffer();
        wordRXString.append("(");
        for (int i = 0; i < excludeWords.size(); i++)
        {
            if (i > 0)
            {
                wordRXString.append("|");
            }
            wordRXString.append(" " + (String) excludeWords.get(i) + " ");
            wordRXString.append("|");
            wordRXString.append("^" + (String) excludeWords.get(i) + " ");
            wordRXString.append("|");
            wordRXString.append(" " + (String) excludeWords.get(i) + "$");
        }
        wordRXString.append(")");
        wordRX = Pattern.compile(wordRXString.toString());

        return;
    }

    /**
     * read in the given config file and populate the class globals
     *
     * @param configFile the config file to read in
     */
    public static void readConfig(String configFile)
        throws IOException
    {
        // instantiate aggregators
        actionAggregator = new HashMap();
        searchAggregator = new HashMap();
        userAggregator = new HashMap();
        itemAggregator = new HashMap();
        archiveStats = new HashMap();

        // instantiate lists
        generalSummary = new ArrayList();
        excludeWords = new ArrayList();
        excludeTypes = new ArrayList();
        excludeChars = new ArrayList();
        itemTypes = new ArrayList();

        // prepare our standard file readers and buffered readers
        FileReader fr = null;
        BufferedReader br = null;

        String record = null;
        try
        {
            fr = new FileReader(configFile);
            br = new BufferedReader(fr);
        }
        catch (IOException e)
        {
            System.out.println("Failed to read config file: " + configFile);
            System.exit(0);
        }

        // read in the config file and set up our instance variables
        while ((record = br.readLine()) != null)
        {
            // check to see what kind of line we have
            Matcher matchComment = comment.matcher(record);
            Matcher matchReal = real.matcher(record);

            // if the line is not a comment and is real, read it in
            // (note: find() rather than the original matches(), so that any
            // line starting with '#' is treated as a comment, not just the
            // bare "#" line)
            if (!matchComment.find() && matchReal.matches())
            {
                // lift the values out of the matcher's result groups
                String key = matchReal.group(1).trim();
                String value = matchReal.group(2).trim();

                // read the config values into our instance variables (see
                // documentation for more info on config params)
                if (key.equals("general.summary"))
                {
                    actionAggregator.put(value, new Integer(0));
                    generalSummary.add(value);
                }

                if (key.equals("exclude.word"))
                {
                    excludeWords.add(value);
                }

                if (key.equals("exclude.type"))
                {
                    excludeTypes.add(value);
                }

                if (key.equals("exclude.character"))
                {
                    excludeChars.add(value);
                }

                if (key.equals("item.type"))
                {
                    itemTypes.add(value);
                }

                if (key.equals("item.floor"))
                {
                    itemFloor = Integer.parseInt(value);
                }

                if (key.equals("search.floor"))
                {
                    searchFloor = Integer.parseInt(value);
                }

                if (key.equals("item.lookup"))
                {
                    itemLookup = Integer.parseInt(value);
                }

                if (key.equals("user.email"))
                {
                    userEmail = value;
                }
            }
        }

        // close the inputs
        br.close();
        fr.close();

        return;
    }

    /**
     * increment the value of the given map at the given key by one.
     *
     * @param map the map whose value we want to increase
     * @param key the key of the map whose value to increase
     *
     * @return an Integer object containing the new value
     */
    public static Integer increment(Map map, String key)
    {
        Integer newValue = null;
        if (map.containsKey(key))
        {
            // FIXME: this seems like a ridiculous way to add Integers
            newValue = new Integer(((Integer) map.get(key)).intValue() + 1);
        }
        else
        {
            newValue = new Integer(1);
        }
        return newValue;
    }

    /**
     * Take the standard date string requested at the command line and convert
     * it into a Date object.  Throws an error and exits if the date does
     * not parse
     *
     * @param date the string representation of the date
     *
     * @return a Date object containing the date, with the time set to
     *         00:00:00
     */
    public static Date parseDate(String date)
    {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy'-'MM'-'dd");
        Date parsedDate = null;

        try
        {
            parsedDate = sdf.parse(date);
        }
        catch (ParseException e)
        {
            System.out.println("The date is not in the correct format");
            System.exit(0);
        }

        return parsedDate;
    }

    /**
     * Take the date object and convert it into a string of the form YYYY-MM-DD
     *
     * @param date the date to be converted
     *
     * @return a string of the form YYYY-MM-DD
     */
    public static String unParseDate(Date date)
    {
        // Use SimpleDateFormat
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy'-'MM'-'dd");
        return sdf.format(date);
    }

    /**
     * Take a search query string and pull out all of the meaningful
     * information from it, giving the results in the form of a String array,
     * one word per element
     *
     * @param query the search query to be analysed
     *
     * @return the string array containing meaningful search terms
     */
    public static String[] analyseQuery(String query)
    {
        // register our standard loop counter
        int i = 0;

        // make the query string totally lower case, to ensure we don't miss
        // out on matches due to capitalisation
        query = query.toLowerCase();

        // now perform successive find and replace operations using
        // pre-defined global regular expressions
        Matcher matchQuery = queryRX.matcher(query);
        query = matchQuery.replaceAll(" ");

        Matcher matchCollection = collectionRX.matcher(query);
        query = matchCollection.replaceAll(" ");

        Matcher matchCommunity = communityRX.matcher(query);
        query = matchCommunity.replaceAll(" ");

        Matcher matchResults = resultsRX.matcher(query);
        query = matchResults.replaceAll(" ");

        Matcher matchTypes = typeRX.matcher(query);
        query = matchTypes.replaceAll(" ");

        Matcher matchChars = excludeCharRX.matcher(query);
        query = matchChars.replaceAll(" ");

        Matcher matchWords = wordRX.matcher(query);
        query = matchWords.replaceAll(" ");

        Matcher single = singleRX.matcher(query);
        query = single.replaceAll(" ");

        // split the remaining string by whitespace, trim and stuff into an
        // array to be returned
        StringTokenizer st = new StringTokenizer(query);
        String[] words = new String[st.countTokens()];
        for (i = 0; i < words.length; i++)
        {
            words[i] = st.nextToken().trim();
        }

        // FIXME: some single characters are still slipping through the net;
        // why?  and how do we fix it?
        return words;
    }
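
    // Illustrative example (not in the original source; the input below is
    // invented): given a raw parameter string such as
    //
    //   "query=cats and dogs,results=42"
    //
    // analyseQuery() above strips the query= prefix, the results clause,
    // any excluded types, words and characters, and stray single characters,
    // leaving roughly { "cats", "dogs" } (assuming "and" is configured as an
    // exclude word in dstat.cfg).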

    /**
     * split the given line into its relevant segments if applicable (i.e.
     * the line matches the required regular expression)
     *
     * @param line the line to be segmented
     *
     * @return a LogLine object for the given line
     */
    public static LogLine getLogLine(String line)
    {
        // FIXME: consider moving this code into the LogLine class.  To do
        // this we need to define the structure and behaviour of the LogLine
        // class much more carefully
        Matcher match;

        if (line.indexOf(":ip_addr") > 0)
        {
            match = valid14.matcher(line);
        }
        else
        {
            match = valid13.matcher(line);
        }

        if (match.matches())
        {
            // set up a new log line object
            LogLine logLine = new LogLine(parseDate(match.group(1).trim()),
                LogManager.unescapeLogField(match.group(2)).trim(),
                LogManager.unescapeLogField(match.group(3)).trim(),
                LogManager.unescapeLogField(match.group(4)).trim(),
                LogManager.unescapeLogField(match.group(5)).trim());

            return logLine;
        }
        else
        {
            return null;
        }
    }
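
    // Illustrative log line shapes (assumed from the valid13/valid14
    // patterns above; the values are invented).  A 1.3-era line:
    //
    //   2005-03-01 12:34:56,789 INFO  org.dspace.app.Xyz @ user@example.com:sess123:view_item:handle=123456789/1
    //
    // and a 1.4-era line, which carries an extra ip_addr field and is
    // detected by the ":ip_addr" test in getLogLine():
    //
    //   2010-01-15 12:34:56,789 INFO  org.dspace.app.Xyz @ user@example.com:session_id=ABC123:ip_addr=10.0.0.1:view_item:handle=123456789/1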

    /**
     * get the number of items in the archive which were accessioned between
     * the provided start and end dates, with the given value for the DC field
     * 'type' (unqualified)
     *
     * @param context the DSpace context for the action
     * @param type    value for DC field 'type' (unqualified)
     *
     * @return an Integer containing the relevant count
     */
    public static Integer getNumItems(Context context, String type)
        throws SQLException
    {
        boolean oracle = false;
        if ("oracle".equals(ConfigurationManager.getProperty("db.name")))
        {
            oracle = true;
        }

        // FIXME: this method is clearly not optimised
        // FIXME: we don't yet collect total statistics, such as the number
        // of items withdrawn, the number in process of submission, etc.  We
        // should probably do that

        // start the type constraint
        String typeQuery = null;

        if (type != null)
        {
            typeQuery = "SELECT item_id " +
                        "FROM metadatavalue " +
                        "WHERE text_value LIKE '%" + type + "%' " +
                        "AND metadata_field_id = (" +
                        " SELECT metadata_field_id " +
                        " FROM metadatafieldregistry " +
                        " WHERE element = 'type' " +
                        " AND qualifier IS NULL) ";
        }

        // start the date constraint query buffer
        StringBuffer dateQuery = new StringBuffer();
        if (oracle)
        {
            dateQuery.append("SELECT /*+ ORDERED_PREDICATES */ item_id ");
        }
        else
        {
            dateQuery.append("SELECT item_id ");
        }

        dateQuery.append("FROM metadatavalue " +
                         "WHERE metadata_field_id = (" +
                         " SELECT metadata_field_id " +
                         " FROM metadatafieldregistry " +
                         " WHERE element = 'date' " +
                         " AND qualifier = 'accessioned') ");

        if (startDate != null)
        {
            if (oracle)
            {
                dateQuery.append(" AND TO_TIMESTAMP( TO_CHAR(text_value), " +
                        "'yyyy-mm-dd\"T\"hh24:mi:ss\"Z\"' ) > TO_DATE('" +
                        unParseDate(startDate) + "', 'yyyy-MM-dd') ");
            }
            else
            {
                dateQuery.append(" AND text_value::timestamp > '" +
                        unParseDate(startDate) + "'::timestamp ");
            }
        }

        if (endDate != null)
        {
            if (oracle)
            {
                dateQuery.append(" AND TO_TIMESTAMP( TO_CHAR(text_value), " +
                        "'yyyy-mm-dd\"T\"hh24:mi:ss\"Z\"' ) < TO_DATE('" +
                        unParseDate(endDate) + "', 'yyyy-MM-dd') ");
            }
            else
            {
                dateQuery.append(" AND text_value::timestamp < '" +
                        unParseDate(endDate) + "'::timestamp ");
            }
        }

        // build the final query
        StringBuffer query = new StringBuffer();

        query.append("SELECT COUNT(*) AS num " +
                     "FROM item " +
                     "WHERE in_archive = " + (oracle ? "1 " : "true ") +
                     "AND withdrawn = " + (oracle ? "0 " : "false "));

        if (startDate != null || endDate != null)
        {
            query.append(" AND item_id IN ( " + dateQuery.toString() + ") ");
        }

        if (type != null)
        {
            query.append(" AND item_id IN ( " + typeQuery + ") ");
        }

        TableRow row = DatabaseManager.querySingle(context, query.toString());

        Integer numItems;
        if (oracle)
        {
            numItems = new Integer(row.getIntColumn("num"));
        }
        else
        {
            // for some reason the number column is of "long" data type!
            Long count = new Long(row.getLongColumn("num"));
            numItems = new Integer(count.intValue());
        }
        return numItems;
    }

    /**
     * get the total number of items in the archive at time of execution,
     * ignoring all other constraints
     *
     * @param context the DSpace context the action is being performed in
     *
     * @return an Integer containing the number of items in the archive
     */
    public static Integer getNumItems(Context context)
        throws SQLException
    {
        return getNumItems(context, null);
    }

    /**
     * print out the usage information for this class to the standard out
     */
    public static void usage()
    {
        String usage = "Usage Information:\n" +
                       "LogAnalyser [options [parameters]]\n" +
                       "-log [log directory]\n" +
                       "\tOptional\n" +
                       "\tSpecify a directory containing log files\n" +
                       "\tDefault uses [dspace.dir]/log from dspace.cfg\n" +
                       "-file [file name regex]\n" +
                       "\tOptional\n" +
                       "\tSpecify a regular expression as the file name template.\n" +
                       "\tCurrently this needs to be correctly escaped for Java string handling (FIXME)\n" +
                       "\tDefault uses dspace.log*\n" +
                       "-cfg [config file path]\n" +
                       "\tOptional\n" +
                       "\tSpecify a config file to be used\n" +
                       "\tDefault uses dstat.cfg in dspace config directory\n" +
                       "-out [output file path]\n" +
                       "\tOptional\n" +
                       "\tSpecify an output file to write results into\n" +
                       "\tDefault uses dstat.dat in dspace log directory\n" +
                       "-start [YYYY-MM-DD]\n" +
                       "\tOptional\n" +
                       "\tSpecify the start date of the analysis\n" +
                       "\tIf a start date is specified then no attempt to gather \n" +
                       "\tcurrent database statistics will be made unless -lookup is\n" +
                       "\talso passed\n" +
                       "\tDefault is to start from the earliest date records exist for\n" +
                       "-end [YYYY-MM-DD]\n" +
                       "\tOptional\n" +
                       "\tSpecify the end date of the analysis\n" +
                       "\tIf an end date is specified then no attempt to gather \n" +
                       "\tcurrent database statistics will be made unless -lookup is\n" +
                       "\talso passed\n" +
                       "\tDefault is to work up to the last date records exist for\n" +
                       "-lookup\n" +
                       "\tOptional\n" +
                       "\tForce a lookup of the current database statistics\n" +
                       "\tOnly needs to be used if date constraints are also in place\n" +
                       "-help\n" +
                       "\tdisplay this usage information\n";

        System.out.println(usage);
    }
}
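
// For illustration (not part of the original source): with type = "article"
// and a start date set, the PostgreSQL query assembled by getNumItems()
// above has roughly this shape (line breaks added for readability):
//
//   SELECT COUNT(*) AS num FROM item
//   WHERE in_archive = true AND withdrawn = false
//     AND item_id IN ( SELECT item_id FROM metadatavalue
//                      WHERE metadata_field_id = (... 'date'/'accessioned' ...)
//                      AND text_value::timestamp > '2010-01-01'::timestamp )
//     AND item_id IN ( SELECT item_id FROM metadatavalue
//                      WHERE text_value LIKE '%article%'
//                      AND metadata_field_id = (... 'type' ...) )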