/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.core.benchmarks.memtime;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryPoolMXBean;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.carrot2.core.Controller;
import org.carrot2.core.ControllerFactory;
import org.carrot2.core.Document;
import org.carrot2.core.IProcessingComponent;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.util.CloseableUtils;
import org.junit.BeforeClass;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.Root;
import org.simpleframework.xml.core.Persister;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.carrot2.shaded.guava.common.collect.Maps;
/**
 * Compute approximate memory and time characteristic for a given algorithm and input.
 * <p>
 * The class is a base for concrete benchmarks: static input data is loaded once in
 * {@link #initStaticData()} and subclasses call
 * {@link #evalShortDocs(String, Class, int, int, int)} for the algorithm they want to
 * measure. All configuration is read from system properties, see
 * {@link #overrideDefaults()}.
 */
@SuppressWarnings("unused")
public class MemTimeBenchmark
{
    /**
     * Due to class renames (LanguageCode changed its package) the logged XML files don't
     * deserialize properly. This is a stub wrapper that only reads documents.
     */
    @Root(name = "searchresult", strict = false)
    public static class ResponseWrapper
    {
        /** Documents found in the file, <code>null</code> if the file had none. */
        @ElementList(inline = true, required = false)
        public List<Document> documents;
    }

    /**
     * A list of input documents. These documents should be some real-life input snippets
     * (preferably short, because longer input is simulated by concatenating documents
     * together).
     */
    protected static ArrayList<Document> documents = new ArrayList<Document>();

    /**
     * Directory with input XML files. Files should be organized using file name
     * convention:
     *
     * <pre>
     * response-xxxxx.xml
     * </pre>
     *
     * where <code>xxxxx</code> is a zero-padded, five-digit sequential number starting
     * from 0.
     */
    private static File inputFilesDir;

    /**
     * Folder for the output log files.
     */
    private static File outputFilesDir;

    /**
     * Maximum number of input files to read from disk.
     */
    private static int MAX_FILES;

    /**
     * Minimum documents to cluster.
     */
    protected static int MIN;

    /**
     * Maximum documents to cluster.
     */
    protected static int MAX;

    /**
     * Increment step for the documents to cluster range.
     */
    protected static int STEP;

    /**
     * The controller used to drive the clustering process.
     */
    private static Controller controller;

    /**
     * Override defaults with system properties. Each setting is read from the system
     * property of the same name; the fallback values are <code>input</code> and
     * <code>tmp</code> for the folders, 200 input files and a 100..20000 document range
     * with a step of 100.
     *
     * @throws NumberFormatException if a numeric property is not a valid integer.
     */
    public static void overrideDefaults()
    {
        inputFilesDir = new File(System.getProperty("inputFilesDir", "input"));
        outputFilesDir = new File(System.getProperty("outputFilesDir", "tmp"));
        MAX_FILES = Integer.parseInt(System.getProperty("MAX_FILES", "200"));
        MIN = Integer.parseInt(System.getProperty("MIN", "100"));
        MAX = Integer.parseInt(System.getProperty("MAX", "20000"));
        STEP = Integer.parseInt(System.getProperty("STEP", "100"));
    }

    /**
     * Populate {@link #documents} from the first {@link #MAX_FILES} input files.
     * <p>
     * The list is cleared first: it is static and shared by all subclasses, so without
     * the reset every additional benchmark class run in the same JVM would append
     * another copy of the input.
     *
     * @throws Exception any failure raised while reading or parsing an input file is
     *             propagated to the caller.
     */
    public static void readData() throws Exception
    {
        documents.clear();

        final Persister persister = new Persister();
        for (int i = 0; i < MAX_FILES; i++)
        {
            // Fixed locale: the default one may format the number with non-ASCII digits.
            final String fileName = String.format(Locale.ENGLISH, "response-%05d.xml", i);
            final ResponseWrapper response = persister.read(ResponseWrapper.class,
                new File(inputFilesDir, fileName));

            // The element list is optional, so a file without documents yields null.
            if (response.documents != null)
            {
                documents.addAll(response.documents);
            }
        }
    }

    /**
     * Make sure {@link #outputFilesDir} exists, creating it (and any missing parents)
     * if needed, so that opening log files inside it does not fail.
     *
     * @throws IOException if the folder does not exist and could not be created.
     */
    private static void ensureOutputDir() throws IOException
    {
        // The trailing isDirectory() check covers a folder created concurrently.
        if (!outputFilesDir.isDirectory() && !outputFilesDir.mkdirs()
            && !outputFilesDir.isDirectory())
        {
            throw new IOException("Could not create output folder: "
                + outputFilesDir.getAbsolutePath());
        }
    }

    /**
     * Dump JVM info to the output folder. Writes a <code>jvm.log</code> file with the
     * current date, a sorted list of JVM/OS system properties and the number of
     * available processors.
     *
     * @throws Exception if the output file cannot be created or written.
     */
    private static void dumpJVMInfo() throws Exception
    {
        final String [] properties =
        {
            "java.runtime.name", "java.vm.version", "java.vm.vendor", "java.vm.name",
            "java.vm.specification.name", "java.runtime.version", "os.arch",
            "java.vm.specification.vendor", "os.name", "java.specification.name",
            "sun.management.compiler", "os.version", "java.specification.version",
            "java.vm.specification.version", "sun.arch.data.model",
            "java.specification.vendor", "java.vm.info", "java.version", "java.vendor",
            "sun.cpu.isalist",
        };
        Arrays.sort(properties);

        ensureOutputDir();
        final File output = new File(outputFilesDir, "jvm.log");
        Writer w = null;
        try
        {
            w = new OutputStreamWriter(new FileOutputStream(output), "UTF-8");
            w.write("Benchmark executed at: " + new Date() + "\n\n");
            for (String prop : properties)
            {
                w.write(prop + "=" + System.getProperty(prop, "n/a") + "\n");
            }
            w.write("processors=" + Runtime.getRuntime().availableProcessors() + "\n");
            w.write("\n");
        }
        finally
        {
            CloseableUtils.close(w);
        }
    }

    /**
     * Initialize static data: read the configuration, load input documents, dump the
     * JVM description and create the shared pooling controller.
     *
     * @throws Exception if the input cannot be read or the JVM log cannot be written.
     */
    @BeforeClass
    public static void initStaticData() throws Exception
    {
        overrideDefaults();
        readData();
        dumpJVMInfo();
        controller = ControllerFactory.createPooling();
    }

    /**
     * Perform the time/memory evaluation for a single algorithm.
     * <p>
     * The algorithm is run on growing prefixes of {@link #documents}: <code>MIN</code>,
     * <code>MIN + STEP</code>, ... for as long as the prefix size does not exceed
     * <code>MAX</code> or the number of loaded documents (both inclusive). One line per
     * run (document count, input size, elapsed time, peak memory) is logged and written
     * to <code>resultPrefix + ".log"</code> in the output folder. An
     * {@link OutOfMemoryError} ends the series early and is only logged.
     *
     * @param resultPrefix name of the logger and base name of the output file.
     * @param algorithm the processing component to evaluate.
     * @param MIN the smallest number of documents to process, must not be negative.
     *            Note that the parameters shadow the static fields of the same name.
     * @param MAX the largest number of documents to process (inclusive).
     * @param STEP increment of the document count between runs, must be positive.
     * @throws IllegalArgumentException if <code>MIN</code> is negative or
     *             <code>STEP</code> is not positive.
     * @throws RuntimeException wrapping an {@link IOException} if the output file
     *             cannot be created or written.
     */
    protected void evalShortDocs(String resultPrefix,
        Class<? extends IProcessingComponent> algorithm, int MIN, int MAX, int STEP)
    {
        if (MIN < 0)
        {
            throw new IllegalArgumentException("MIN must not be negative: " + MIN);
        }
        if (STEP <= 0)
        {
            // A non-positive step would never terminate the loop below.
            throw new IllegalArgumentException("STEP must be positive: " + STEP);
        }

        final Logger logger = LoggerFactory.getLogger(resultPrefix);
        final File output = new File(outputFilesDir, resultPrefix + ".log");

        // Largest usable prefix; both bounds are inclusive.
        final int limit = Math.min(MAX, documents.size());

        Writer w = null;
        int docs = 0;
        try
        {
            ensureOutputDir();
            w = new OutputStreamWriter(new FileOutputStream(output), "UTF-8");

            final String header = "docs size[MB] time[s] mem[MB]";
            w.write(header + "\n");
            logger.info(header);

            // Counting in a long keeps "next += STEP" from overflowing.
            for (long next = MIN; next <= limit; next += STEP)
            {
                docs = (int) next;

                // Start from a clean heap and reset peak counters before measuring.
                memClean();
                memPeak();

                final long start = now();
                final HashMap<String, Object> attributes = Maps.newHashMap();
                final List<Document> inputList = documents.subList(0, docs);
                attributes.put(AttributeNames.DOCUMENTS, inputList);

                // Optional: uncomment to also index the input with Lucene.
                // luceneIndex(inputList);
                controller.process(attributes, algorithm);
                final long end = now();

                final double memUsedMB = memPeak() / (1024 * 1024.0);
                final double timeSecs = (end - start) / 1000.0;
                final double mbLength = countByteLength(inputList) / (1024 * 1024.0);
                final int docsCount = inputList.size();

                final String logLine = String.format(Locale.ENGLISH, "%d %.2f %.2f %.2f",
                    docsCount, mbLength, timeSecs, memUsedMB);
                logger.info(logLine);
                w.write(logLine + "\n");
                w.flush();
            }
        }
        catch (OutOfMemoryError e)
        {
            // Expected end of the series for large inputs; report where it happened.
            logger.warn("OOM at: {}", docs);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
        finally
        {
            CloseableUtils.close(w);
        }
    }

    /**
     * Index documents in-memory using Lucene. Titles and summaries are added as
     * non-stored text fields (<code>title</code>, <code>snippet</code>); missing values
     * are indexed as empty strings. All Lucene resources are released before the
     * method returns, also when indexing fails.
     *
     * @param inputList the documents to index.
     * @throws RuntimeException wrapping an {@link IOException} raised by Lucene.
     */
    private void luceneIndex(List<Document> inputList)
    {
        StandardAnalyzer analyzer = null;
        Directory dir = null;
        IndexWriter w = null;
        try
        {
            analyzer = new StandardAnalyzer();
            dir = new RAMDirectory();
            w = new IndexWriter(dir, new IndexWriterConfig(analyzer));
            for (Document d : inputList)
            {
                final org.apache.lucene.document.Document nd = new org.apache.lucene.document.Document();
                nd.add(new TextField("title", StringUtils.defaultIfEmpty(d.getTitle(), ""), Store.NO));
                nd.add(new TextField("snippet", StringUtils.defaultIfEmpty(d.getSummary(), ""), Store.NO));
                w.addDocument(nd);
            }

            // Close explicitly so that a failure here is reported, not swallowed.
            w.close();
            w = null;
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
        finally
        {
            // The writer is only non-null here if indexing failed half-way.
            CloseableUtils.close(w);
            CloseableUtils.close(dir);
            CloseableUtils.close(analyzer);
        }
    }

    /**
     * Count the overall length of the input (titles and snippets). Despite the method's
     * name, the length is expressed in characters (as returned by
     * {@link String#length()}), not in encoded bytes.
     *
     * @param inputList the documents to measure.
     * @return the sum of lengths of all non-null titles and summaries.
     */
    private static long countByteLength(List<Document> inputList)
    {
        long length = 0;
        for (Document d : inputList)
        {
            final String title = d.getTitle();
            if (title != null)
            {
                length += title.length();
            }

            final String summary = d.getSummary();
            if (summary != null)
            {
                length += summary.length();
            }
        }
        return length;
    }

    /**
     * @return Return a timestamp in milliseconds, suitable only for computing elapsed
     *         time. It is derived from {@link System#nanoTime()}, which, unlike
     *         {@link System#currentTimeMillis()}, is not affected by wall-clock
     *         adjustments.
     */
    private static long now()
    {
        return System.nanoTime() / 1000000L;
    }

    /**
     * Best-effort attempt to force {@link System#gc()}.
     */
    private static void memClean()
    {
        System.gc();
        System.gc();
        Thread.yield();
    }

    /**
     * Return the peak number of bytes used (all memory pools) and reset the peak usage.
     * Pools that report no peak usage (invalid pools) are counted as zero.
     *
     * @return the sum of peak used bytes over all memory pools since the last reset.
     */
    private static long memPeak()
    {
        long peak = 0;
        for (MemoryPoolMXBean b : ManagementFactory.getMemoryPoolMXBeans())
        {
            // getPeakUsage() returns null for a pool that is no longer valid.
            final java.lang.management.MemoryUsage usage = b.getPeakUsage();
            if (usage != null)
            {
                peak += usage.getUsed();
            }
            b.resetPeakUsage();
        }
        return peak;
    }
}