/*
* StatisticsImporter.java
*
* Version: $Revision: 4882 $
*
* Date: $Date: 2010-05-05 02:03:35 +0000 (Wed, 05 May 2010) $
*
* Copyright (c) 2002-2010, The DSpace Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* - Neither the name of the DSpace Foundation nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
package org.dspace.statistics.util;
import org.apache.commons.cli.*;
import org.apache.commons.lang.time.DateFormatUtils;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.dspace.content.*;
import org.dspace.content.Collection;
import org.dspace.core.Context;
import org.dspace.core.Constants;
import org.dspace.core.ConfigurationManager;
import org.dspace.eperson.EPerson;
import org.dspace.statistics.SolrLogger;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.io.*;
import java.util.*;
import com.maxmind.geoip.LookupService;
import com.maxmind.geoip.Location;
/**
* Class to load intermediate statistics files into solr
*
* @author Stuart Lewis
*/
public class StatisticsImporter
{
/**
 * Format of the timestamp field in the intermediate statistics file; used
 * by load() to parse each line's date. (The time written to solr is
 * formatted separately with SolrLogger.DATE_FORMAT_8601.)
 */
private static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
/** Solr server connection; created in main() from the "solr.log.server" configuration property */
private static CommonsHttpSolrServer solr;
/** GEOIP lookup service; created in main() from the database file named by the "solr.dbfile" configuration property */
private static LookupService geoipLookup;
/**
 * Metadata storage information, obtained in main() from
 * SolrLogger.getMetadataStorageInfo(). load() uses each key as a solr
 * field name and each value as a dot-separated metadata field name
 * (three parts, split on '.').
 */
private static Map metadataStorageInfo;
/** Whether to skip the DNS reverse lookup or not; set to true by the -s command line option */
private static boolean skipReverseDNS = false;
/** IDs of local items; only populated by the constructor when local data is requested */
private Vector<Integer> localItems;
/** IDs of local collections; only populated by the constructor when local data is requested */
private Vector<Integer> localCollections;
/** IDs of local communities; only populated by the constructor when local data is requested */
private Vector<Integer> localCommunities;
/** IDs of local bitstreams that have a non-null name; only populated by the constructor when local data is requested */
private Vector<Integer> localBitstreams;
/** Whether or not to replace item IDs with local values (for testing) */
private boolean useLocal;
/**
 * Constructor. Optionally loads the IDs of all local communities,
 * collections, items and (named) bitstreams so that load() can substitute
 * them for the foreign IDs found in someone else's log files.
 *
 * If the local data cannot be read the error is printed and the JVM exits
 * with status 1.
 *
 * @param local Whether to use local data
 */
public StatisticsImporter(boolean local)
{
    useLocal = local;
    if (!local)
    {
        // Nothing to preload: IDs from the log file are used as they are
        return;
    }

    // Setup the lists of communities, collections, items & bitstreams
    Context c = null;
    try
    {
        c = new Context();

        System.out.print("Loading local communities... ");
        Community[] communities = Community.findAll(c);
        localCommunities = new Vector<Integer>();
        for (Community community : communities)
        {
            localCommunities.add(community.getID());
        }
        System.out.println("Found " + localCommunities.size());

        System.out.print("Loading local collections... ");
        Collection[] collections = Collection.findAll(c);
        localCollections = new Vector<Integer>();
        for (Collection collection : collections)
        {
            localCollections.add(collection.getID());
        }
        System.out.println("Found " + localCollections.size());

        System.out.print("Loading local items... ");
        ItemIterator items = Item.findAll(c);
        localItems = new Vector<Integer>();
        while (items.hasNext())
        {
            localItems.add(items.next().getID());
        }
        System.out.println("Found " + localItems.size());

        System.out.print("Loading local bitstreams... ");
        Bitstream[] bitstreams = Bitstream.findAll(c);
        localBitstreams = new Vector<Integer>();
        for (Bitstream bitstream : bitstreams)
        {
            // Only bitstreams that have a name are used as substitutes
            if (bitstream.getName() != null)
            {
                localBitstreams.add(bitstream.getID());
            }
        }
        System.out.println("Found " + localBitstreams.size());
    }
    catch (Exception e)
    {
        System.err.println("Error retrieving items from DSpace database:");
        e.printStackTrace();
        System.exit(1);
    }
    finally
    {
        // Only IDs were copied out, so the context (and its database
        // connection) is no longer needed; nothing was modified, so abort
        // rather than complete.
        if (c != null)
        {
            c.abort();
        }
    }
}
/**
 * Method to load the lines from the statistics file and load them into solr.
 *
 * Each line is expected to be comma separated with at least six fields:
 * uuid, action, object id, date (parsed with dateFormat), user e-mail and
 * IP address. For every usable line one document is added to solr; a
 * single commit is issued after the whole file has been read.
 *
 * Lines are skipped (and counted as errors) when they are malformed, name
 * an unknown action, have an unknown country code, or refer to an object
 * that can no longer be found. Lines whose reverse DNS name belongs to a
 * known search engine are skipped and counted separately. Any other
 * exception (I/O, database, solr) stops processing of the file; whatever
 * was added up to that point is still committed.
 *
 * @param filename The filename of the file to load
 * @param context The DSpace Context
 * @param verbose Whether to display verbose output
 */
private void load(String filename, Context context, boolean verbose)
{
    // Print out the filename for confirmation
    System.out.println("Processing file: " + filename);

    // Counters for the summary printed at the end
    int counter = 0;
    int errors = 0;
    int searchengines = 0;

    BufferedReader input = null;
    try
    {
        input = new BufferedReader(new FileReader(new File(filename)));

        DNSCache<String, String> dnsCache = new DNSCache<String, String>(2500, 0.75f, 2500);
        Random rand = new Random();

        String line;
        while ((line = input.readLine()) != null)
        {
            counter++;
            // Assume the line fails; undone below once it is either
            // classified as search engine traffic or stored in solr
            errors++;
            if (verbose) System.out.println("Line:" + line);

            // Tokenise the line (parts[0] is the uuid, which is not used)
            String[] parts = line.split(",");
            if (parts.length < 6)
            {
                if (verbose) System.err.println(" - Skipping malformed line: expected 6 fields but found " + parts.length);
                continue;
            }
            String action = parts[1];
            String id = parts[2];
            String user = parts[4];
            String ip = parts[5];

            Date date;
            try
            {
                date = dateFormat.parse(parts[3]);
            }
            catch (java.text.ParseException pe)
            {
                if (verbose) System.err.println(" - Skipping line with unparsable date '" + parts[3] + "'");
                continue;
            }

            // Resolve the dns (if applicable) to get rid of search engine bots early on in the processing chain
            String dns = "";
            if (!skipReverseDNS)
            {
                // Is the IP address in the cache?
                String cached = dnsCache.get(ip);
                if (cached != null)
                {
                    dns = cached;
                }
                else
                {
                    try
                    {
                        String resolved = DnsLookup.reverseDns(ip);
                        // Guard against a null result so the suffix checks below are safe
                        if (resolved != null)
                        {
                            dns = resolved;
                            dnsCache.put(ip, dns);
                        }
                    }
                    catch (Exception e)
                    {
                        // Lookup failed: carry on without a host name
                        dns = "";
                    }
                }
            }

            String data = "ip addr = " + ip + ", dns name = " + dns;
            if (dns.endsWith(".googlebot.com.") ||
                dns.endsWith(".crawl.yahoo.net.") ||
                dns.endsWith(".search.msn.com."))
            {
                if (verbose) System.out.println(data + ", IGNORE (search engine)");
                errors--;
                searchengines++;
                continue;
            }

            // Get the geo information for the user. The values start empty
            // on every line so that a failed lookup can never reuse the
            // location of the previous line.
            String continent = "";
            String country = "";
            String countryCode = "";
            String city = "";
            float longitude = 0f;
            float latitude = 0f;
            try
            {
                Location location = geoipLookup.getLocation(ip);
                if (location != null)
                {
                    city = location.city;
                    country = location.countryName;
                    countryCode = location.countryCode;
                    longitude = location.longitude;
                    latitude = location.latitude;
                    if (verbose)
                    {
                        data += (", country = " + country);
                        data += (", city = " + city);
                        System.out.println(data);
                    }
                    try
                    {
                        continent = LocationUtils.getContinentCode(countryCode);
                    }
                    catch (Exception e)
                    {
                        if (verbose) System.out.println("Unknown country code: " + countryCode);
                        continue;
                    }
                }
            }
            catch (Exception e)
            {
                // No problem - just can't look them up
            }

            // Now find our dso
            int type;
            Vector<Integer> localIds;
            if ("view_bitstream".equals(action))
            {
                type = Constants.BITSTREAM;
                localIds = localBitstreams;
            }
            else if ("view_item".equals(action))
            {
                type = Constants.ITEM;
                localIds = localItems;
            }
            else if ("view_collection".equals(action))
            {
                type = Constants.COLLECTION;
                localIds = localCollections;
            }
            else if ("view_community".equals(action))
            {
                type = Constants.COMMUNITY;
                localIds = localCommunities;
            }
            else
            {
                // Do not guess an object type for actions we do not know
                if (verbose) System.err.println(" - Skipping unknown action '" + action + "'");
                continue;
            }

            if (useLocal)
            {
                // Replace the foreign ID with a randomly chosen local one
                if (localIds == null || localIds.isEmpty())
                {
                    if (verbose) System.err.println(" - No local objects available to substitute for action '" + action + "'");
                    continue;
                }
                id = "" + localIds.get(rand.nextInt(localIds.size()));
            }

            int objectId;
            try
            {
                objectId = Integer.parseInt(id);
            }
            catch (NumberFormatException nfe)
            {
                if (verbose) System.err.println(" - Skipping line with non-numeric ID '" + id + "'");
                continue;
            }

            DSpaceObject dso = DSpaceObject.find(context, type, objectId);
            if (dso == null)
            {
                if (verbose) System.err.println(" - DSO with ID '" + id + "' is no longer in the system");
                continue;
            }

            // Get the eperson details
            EPerson eperson = EPerson.findByEmail(context, user);
            int epersonId = 0;
            if (eperson != null)
            {
                epersonId = eperson.getID();
            }

            // Save it in our server
            SolrInputDocument sid = new SolrInputDocument();
            sid.addField("ip", ip);
            sid.addField("type", dso.getType());
            sid.addField("id", dso.getID());
            sid.addField("time", DateFormatUtils.format(date, SolrLogger.DATE_FORMAT_8601));
            sid.addField("continent", continent);
            sid.addField("country", country);
            sid.addField("countryCode", countryCode);
            sid.addField("city", city);
            sid.addField("latitude", latitude);
            sid.addField("longitude", longitude);
            if (epersonId > 0)
            {
                sid.addField("epersonid", epersonId);
            }
            sid.addField("dns", dns.toLowerCase());

            if (dso instanceof Item)
            {
                Item item = (Item) dso;
                // Store the metadata
                for (Object storedField : metadataStorageInfo.keySet())
                {
                    String dcField = (String) metadataStorageInfo.get(storedField);
                    // schema.element.qualifier
                    String[] dcParts = dcField.split("\\.");
                    DCValue[] vals = item.getMetadata(dcParts[0], dcParts[1], dcParts[2], Item.ANY);
                    for (DCValue val1 : vals)
                    {
                        String val = val1.value;
                        sid.addField(String.valueOf(storedField), val);
                        sid.addField(String.valueOf(storedField + "_search"), val.toLowerCase());
                    }
                }
            }

            SolrLogger.storeParents(sid, dso);
            solr.add(sid);
            errors--;
        }
    }
    catch (Exception e)
    {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
    finally
    {
        // Always release the file handle, even when processing was cut short
        if (input != null)
        {
            try
            {
                input.close();
            }
            catch (IOException ioe)
            {
                // Nothing useful can be done if closing a read-only file fails
            }
        }
    }

    DecimalFormat percentage = new DecimalFormat("##.###");
    int committed = counter - errors - searchengines;
    System.out.println("Processed " + counter + " log lines");
    if (counter > 0)
    {
        Double committedpercentage = 100d * committed / counter;
        System.out.println(" - " + committed + " entries added to solr: " + percentage.format(committedpercentage) + "%");
        Double errorpercentage = 100d * errors / counter;
        System.out.println(" - " + errors + " errors: " + percentage.format(errorpercentage) + "%");
        Double sepercentage = 100d * searchengines / counter;
        System.out.println(" - " + searchengines + " search engine activity skipped: " + percentage.format(sepercentage) + "%");
        System.out.print("About to commit data to solr...");

        // Commit at the end because it takes a while
        try
        {
            solr.commit();
        }
        catch (SolrServerException sse)
        {
            System.err.println("Error committing statistics to solr server!");
            sse.printStackTrace();
            System.exit(1);
        }
        catch (IOException ioe)
        {
            System.err.println("Error writing to solr server!");
            ioe.printStackTrace();
            System.exit(1);
        }
    }
    System.out.println(" done!");
}
/**
 * Write the usage summary for the importer and then terminate the JVM.
 *
 * @param options The command line options to describe
 * @param exitCode the system exit code to use
 */
private static void printHelp(Options options, int exitCode)
{
    // Format and print the usage text, then stop with the requested status
    new HelpFormatter().printHelp("StatisticsImporter\n", options);
    System.exit(exitCode);
}
/**
 * Main method to run the statistics importer.
 *
 * Options: -i input file (required), -m treat the input file name as a
 * prefix and import every file in its directory whose name starts with
 * it, -l replace IDs with random local ones, -s skip reverse DNS lookups,
 * -v verbose output, -h help.
 *
 * @param args The command line arguments
 * @throws Exception If something goes wrong
 */
public static void main(String[] args) throws Exception
{
    CommandLineParser parser = new PosixParser();

    Options options = new Options();
    options.addOption("i", "in", true, "the input file");
    options.addOption("l", "local", false, "developers tool - map external log file to local handles");
    options.addOption("m", "multiple", false, "treat the input file as having a wildcard ending");
    options.addOption("s", "skipdns", false, "skip performing reverse DNS lookups on IP addresses");
    options.addOption("v", "verbose", false, "display verbose output (useful for debugging)");
    options.addOption("h", "help", false, "help");

    CommandLine line = parser.parse(options, args);

    // Did the user ask to see the help?
    if (line.hasOption('h'))
    {
        printHelp(options, 0);
    }

    if (!line.hasOption('i'))
    {
        System.err.println("You must specify an input file using the -i flag");
        printHelp(options, 1);
    }

    if (line.hasOption('s'))
    {
        skipReverseDNS = true;
    }

    // Whether or not to convert handles to handles used in a local system
    // (useful if using someone else's log file for testing)
    boolean local = line.hasOption('l');

    // Verbose option
    boolean verbose = line.hasOption('v');

    // We got all our parameters now get the rest
    Context context = new Context();
    try
    {
        // Find our solrserver
        String sserver = ConfigurationManager.getProperty("solr.log.server");
        if (verbose) System.out.println("Writing to solr server at: " + sserver);
        solr = new CommonsHttpSolrServer(sserver);

        metadataStorageInfo = SolrLogger.getMetadataStorageInfo();
        String dbfile = ConfigurationManager.getProperty("solr.dbfile");
        geoipLookup = new LookupService(dbfile, LookupService.GEOIP_STANDARD);

        StatisticsImporter si = new StatisticsImporter(local);
        if (line.hasOption('m'))
        {
            // Convert all the files. The path is made absolute first so
            // that a bare file name still has a parent directory to list.
            final File sample = new File(line.getOptionValue('i')).getAbsoluteFile();
            File dir = sample.getParentFile();
            FilenameFilter filter = new FilenameFilter()
            {
                public boolean accept(File dir, String name)
                {
                    return name.startsWith(sample.getName());
                }
            };

            // list() returns null when the directory cannot be read
            String[] children = (dir == null) ? null : dir.list(filter);
            if (children == null)
            {
                throw new FileNotFoundException("Unable to list the directory containing '"
                        + line.getOptionValue('i') + "'");
            }

            // File.list() gives no ordering guarantee; sort so that runs are repeatable
            Arrays.sort(children);
            for (String in : children)
            {
                System.out.println(in);
                si.load(new File(dir, in).getPath(), context, verbose);
            }
        }
        else
        {
            // Just convert the one file
            si.load(line.getOptionValue('i'), context, verbose);
        }
    }
    finally
    {
        // The context is only read from, so release it without committing
        context.abort();
    }
}
/**
 * Inner class to hold a cache of reverse lookups of IP addresses.
 *
 * The map is access-ordered, so once more than maxCapacity entries are
 * present the least recently used entry is evicted.
 *
 * @param <K> type of the cache keys
 * @param <V> type of the cached values
 */
class DNSCache<K,V> extends LinkedHashMap<K,V>
{
    private static final long serialVersionUID = 1L;

    /** Largest number of entries the cache may hold */
    private final int maxCapacity;

    /**
     * @param initialCapacity initial capacity of the underlying map
     * @param loadFactor load factor of the underlying map
     * @param maxCapacity largest number of entries to keep
     */
    public DNSCache(int initialCapacity, float loadFactor, int maxCapacity)
    {
        // true = access order, which is what makes eviction least-recently-used
        super(initialCapacity, loadFactor, true);
        this.maxCapacity = maxCapacity;
    }

    /**
     * Called by the map after a new entry has been inserted, so size()
     * already includes that entry: evict only once the limit is exceeded,
     * otherwise the cache could never hold maxCapacity entries.
     */
    @Override
    protected boolean removeEldestEntry(java.util.Map.Entry<K,V> eldest)
    {
        return size() > this.maxCapacity;
    }
}
}