package be.bagofwords.main.tests.bigrams;
import be.bagofwords.application.ApplicationContext;
import be.bagofwords.application.ApplicationManager;
import be.bagofwords.application.MainClass;
import be.bagofwords.application.MinimalApplicationContextFactory;
import be.bagofwords.db.DataInterface;
import be.bagofwords.db.DataInterfaceFactory;
import be.bagofwords.db.DatabaseCachingType;
import be.bagofwords.db.combinator.LongCombinator;
import be.bagofwords.db.experimental.kyoto.KyotoDataInterfaceFactory;
import be.bagofwords.db.remote.RemoteDatabaseInterfaceFactory;
import be.bagofwords.text.WordIterator;
import be.bagofwords.ui.UI;
import be.bagofwords.util.HashUtils;
import be.bagofwords.util.NumUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.mutable.MutableLong;
import java.io.*;
import java.util.Collections;
import java.util.HashMap;
import java.util.concurrent.CountDownLatch;
/**
 * Benchmark that measures the write and read throughput of a {@link DataInterface}
 * created by a {@link DataInterfaceFactory}, using hashed word bigrams as keys.
 *
 * <p>The bigrams are extracted once from a large text file and cached as raw
 * {@code long} values in a binary file; the tests then stream those values to a
 * number of {@code BigramTestsThread} workers.
 */
public class BigramTestsMain implements MainClass {

    // Lower/upper bound of the tested data set sizes. The test loops scale these
    // by 1024 * 1024, while prepareBigrams() scales the upper bound by 1000 * 1000 * 3.
    private static final long MIN_MILLION_ITEMS_TO_PROCESS = 1;
    private static final long MAX_MILLION_ITEMS_TO_PROCESS = 256;
    // Scratch directory that is wiped and recreated at the start of every run.
    private static final File tmpDbDir = new File("/tmp/testBigramCounts");

    private final File largeTextFile;
    private final File bigramFile;

    /**
     * @param largeTextFile text file from which the bigrams are extracted
     * @param bigramFile    binary cache file that holds the hashed bigrams
     */
    public BigramTestsMain(File largeTextFile, File bigramFile) {
        this.largeTextFile = largeTextFile;
        this.bigramFile = bigramFile;
    }

    /**
     * Command line entry point. Expects exactly one argument: the path to a large
     * text file. The bigram cache is stored in {@code /tmp/bigrams.bin}.
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        if (args.length != 1) {
            UI.writeError("Please provide the path to a large text file");
        } else {
            BigramTestsMain main = new BigramTestsMain(new File(args[0]), new File("/tmp/bigrams.bin"));
            ApplicationManager.runSafely(main, new HashMap<>(), new MinimalApplicationContextFactory());
        }
    }

    /**
     * Prepares the scratch directory and the bigram cache and then runs all tests
     * for the {@code LONG_COUNT} data type.
     *
     * @throws RuntimeException wrapping any exception raised while preparing or running the tests
     */
    public void run(ApplicationContext context) {
        try {
            prepareTmpDir(tmpDbDir);
            prepareBigrams();
            runAllTests(DataType.LONG_COUNT, context);
            // runAllTests(DataType.SERIALIZED_OBJECT);
        } catch (InterruptedException exp) {
            // Restore the interrupt flag so that code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(exp);
        } catch (Exception exp) {
            throw new RuntimeException(exp);
        }
    }

    /**
     * Writes the hashed bigrams of {@code largeTextFile} to {@code bigramFile},
     * unless that file already exists and is non-empty.
     *
     * <p>Bigrams never span two lines. Reading stops at the end of the text file or,
     * checked once per line, as soon as the bigram limit has been reached. If writing
     * fails, the incomplete file is deleted so that a later run does not mistake it
     * for a complete cache.
     *
     * @throws IOException if the text file can not be read or the bigram file can not be written
     */
    private void prepareBigrams() throws IOException {
        if (bigramFile.exists() && bigramFile.length() != 0) {
            return;
        }
        UI.write("Writing bigrams in " + largeTextFile.getAbsolutePath() + " to " + bigramFile.getAbsolutePath());
        long numOfBigramsWritten = 0;
        long bigramsToRead = MAX_MILLION_ITEMS_TO_PROCESS * 1000 * 1000 * 3;
        boolean completed = false;
        try {
            // NOTE(review): FileReader decodes with the platform default charset, as before.
            try (BufferedReader rdr = new BufferedReader(new FileReader(largeTextFile));
                 DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(bigramFile)))) {
                String line;
                while (numOfBigramsWritten < bigramsToRead && (line = rdr.readLine()) != null) {
                    WordIterator wordIterator = new WordIterator(line, Collections.<String>emptySet());
                    String prev = null;
                    while (wordIterator.hasNext()) {
                        String word = wordIterator.next().toString().toLowerCase();
                        if (prev != null) {
                            long bigram = HashUtils.hashCode(prev, " ", word);
                            dos.writeLong(bigram);
                            numOfBigramsWritten++;
                        }
                        prev = word;
                    }
                }
            }
            // Only reached when both streams were flushed and closed without an error.
            completed = true;
        } finally {
            if (!completed && bigramFile.exists() && !bigramFile.delete()) {
                UI.writeError("Could not delete incomplete bigram file " + bigramFile.getAbsolutePath());
            }
        }
        UI.write("Finished writing bigrams.");
    }

    /**
     * Runs the enabled benchmarks for the given data type. The commented-out lines
     * are alternative database back-ends that can be enabled by hand.
     */
    private void runAllTests(DataType dataType, ApplicationContext applicationContext) throws InterruptedException, FileNotFoundException {
        UI.write("Testing batch writing / reading for data type " + dataType);
        // testSeparateWritingReading(dataType, new LevelDBDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/levelDB"), DatabaseCachingType.DIRECT);
        // testSeparateWritingReading(dataType, new FileDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/fileDb"), DatabaseCachingType.CACHED_AND_BLOOM);
        testSeparateWritingReading(dataType, new RemoteDatabaseInterfaceFactory(applicationContext), DatabaseCachingType.CACHED_AND_BLOOM);
        // testSeparateWritingReading(dataType, new KyotoDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/kyotoDB"), DatabaseCachingType.DIRECT);
        // testSeparateWritingReading(dataType, new RocksDBDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/rocksBD", false), DatabaseCachingType.DIRECT);
        UI.write("Testing mixed writing / reading for data type " + dataType);
        // testMixedWritingReading(dataType, new LevelDBDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/levelDB"), DatabaseCachingType.DIRECT);
        // testMixedWritingReading(dataType, new FileDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/fileDb"), DatabaseCachingType.CACHED_AND_BLOOM);
        // testMixedWritingReading(dataType, new RemoteDatabaseInterfaceFactory(cachesManager, memoryManager, taskScheduler, "localhost", 1208), DatabaseCachingType.CACHED_AND_BLOOM);
        // testMixedWritingReading(dataType, new KyotoDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/kyotoDB"), DatabaseCachingType.DIRECT);
        // testMixedWritingReading(dataType, new RocksDBDataInterfaceFactory(cachesManager, memoryManager, taskScheduler, tmpDbDir.getAbsolutePath() + "/rocksBD", false), DatabaseCachingType.DIRECT);
    }

    /**
     * Deletes the given directory if it exists and creates it again, empty.
     *
     * @throws IOException      if the existing directory can not be deleted
     * @throws RuntimeException if the directory can not be created
     */
    private static void prepareTmpDir(File tmpDbDir) throws IOException {
        if (tmpDbDir.exists()) {
            FileUtils.deleteDirectory(tmpDbDir);
        }
        boolean success = tmpDbDir.mkdirs();
        if (!success) {
            throw new RuntimeException("Failed to create db dir " + tmpDbDir.getAbsolutePath());
        }
    }

    /**
     * Runs the separate write / read benchmark with 8 threads for every data set
     * size, doubling the size each round, and terminates the factory afterwards
     * (also when a round fails). The largest size is skipped for Kyoto factories.
     */
    private void testSeparateWritingReading(DataType dataType, DataInterfaceFactory factory, DatabaseCachingType type) throws InterruptedException, FileNotFoundException {
        try {
            for (long items = MIN_MILLION_ITEMS_TO_PROCESS * 1024 * 1024; items <= MAX_MILLION_ITEMS_TO_PROCESS * 1024 * 1024; items *= 2) {
                if (!(factory instanceof KyotoDataInterfaceFactory) || items < 256 * 1024 * 1024) {
                    testSeparateWritingReading(dataType, factory, type, 8, items);
                }
            }
        } finally {
            factory.terminate();
        }
    }

    /**
     * Benchmarks a write phase followed by a read phase and reports both rates.
     *
     * <p>All worker threads of both phases share one input stream over the bigram
     * file, so the read phase continues where the write phase stopped. The stream
     * and the data interface are closed when the method exits, whether it completes
     * normally or with an exception.
     *
     * @param numberOfThreads number of worker threads started per phase
     * @param numberOfItems   item limit handed to every worker thread
     */
    private void testSeparateWritingReading(DataType dataType, DataInterfaceFactory factory, DatabaseCachingType cachingType, int numberOfThreads, long numberOfItems) throws FileNotFoundException, InterruptedException {
        final DataInterface dataInterface = createDataInterface(dataType, cachingType, factory);
        double writesPerSecond;
        double readsPerSecond;
        try {
            dataInterface.dropAllData();
            final DataInputStream inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(bigramFile)));
            try {
                //write data
                // NOTE(review): the counters are shared by all workers; presumably
                // BigramTestsThread synchronizes access to them - confirm.
                MutableLong numberOfItemsWritten = new MutableLong(0);
                CountDownLatch writeLatch = new CountDownLatch(numberOfThreads);
                long startOfWrite = System.nanoTime();
                for (int i = 0; i < numberOfThreads; i++) {
                    new BigramTestsThread(dataType, numberOfItemsWritten, numberOfItems, inputStream, dataInterface, writeLatch, false).start();
                }
                writeLatch.await();
                dataInterface.flush();
                long endOfWrite = System.nanoTime();
                writesPerSecond = numberOfItemsWritten.longValue() * 1e9 / (endOfWrite - startOfWrite);

                //read data
                dataInterface.optimizeForReading();
                MutableLong numberOfItemsRead = new MutableLong(0);
                CountDownLatch readLatch = new CountDownLatch(numberOfThreads);
                long startOfRead = System.nanoTime();
                for (int i = 0; i < numberOfThreads; i++) {
                    new BigramTestsThread(dataType, numberOfItemsRead, numberOfItems, inputStream, dataInterface, readLatch, true).start();
                }
                readLatch.await();
                dataInterface.flush();
                long endOfRead = System.nanoTime();
                readsPerSecond = numberOfItemsRead.longValue() * 1e9 / (endOfRead - startOfRead);
            } finally {
                // The method signature does not allow IOException, so close quietly.
                IOUtils.closeQuietly(inputStream);
            }
        } finally {
            dataInterface.close();
        }
        UI.write(factory.getClass().getSimpleName() + " threads " + numberOfThreads + " items " + numberOfItems + " write " + NumUtils.fmt(writesPerSecond) + " read " + NumUtils.fmt(readsPerSecond));
    }

    /**
     * Runs the mixed write / read benchmark with 8 threads for every data set size,
     * doubling the size each round, and terminates the factory afterwards (also
     * when a round fails). The largest size is skipped for Kyoto factories.
     */
    private void testMixedWritingReading(DataType dataType, DataInterfaceFactory factory, DatabaseCachingType type) throws InterruptedException, FileNotFoundException {
        try {
            for (long items = MIN_MILLION_ITEMS_TO_PROCESS * 1024 * 1024; items <= MAX_MILLION_ITEMS_TO_PROCESS * 1024 * 1024; items *= 2) {
                if (!(factory instanceof KyotoDataInterfaceFactory) || items < 256 * 1024 * 1024) {
                    testMixedWritingReading(dataType, factory, type, largeTextFile, 8, items);
                }
            }
        } finally {
            factory.terminate();
        }
    }

    /**
     * Fills the database with a write-only phase and then benchmarks threads that
     * read and write at the same time, reporting both rates.
     *
     * <p>In the mixed phase the threads with an even index read and the others
     * write; each latch is sized to the number of threads that count it down, so
     * odd thread counts are timed correctly as well. The stream and the data
     * interface are closed when the method exits, whether it completes normally or
     * with an exception.
     *
     * @param largeTextFile   not used by this method
     * @param numberOfThreads number of worker threads per phase
     * @param numberOfItems   item limit for the fill phase; the mixed phase is capped at 100 * 1024 * 1024
     */
    private void testMixedWritingReading(DataType dataType, DataInterfaceFactory factory, DatabaseCachingType cachingType, File largeTextFile, int numberOfThreads, long numberOfItems) throws FileNotFoundException, InterruptedException {
        final DataInterface dataInterface = createDataInterface(dataType, cachingType, factory);
        double readsPerSecond;
        double writesPerSecond;
        try {
            dataInterface.dropAllData();
            final DataInputStream inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(bigramFile)));
            try {
                //first fill database by writing data
                MutableLong numberOfItemsWritten = new MutableLong(0);
                CountDownLatch writeLatch = new CountDownLatch(numberOfThreads);
                for (int i = 0; i < numberOfThreads; i++) {
                    new BigramTestsThread(dataType, numberOfItemsWritten, numberOfItems, inputStream, dataInterface, writeLatch, false).start();
                }
                writeLatch.await();
                dataInterface.flush();

                //now start threads that will write and read data simultaneously
                dataInterface.optimizeForReading();
                MutableLong numberOfItemsRead = new MutableLong(0);
                numberOfItemsWritten = new MutableLong(0);
                // Even indices read, odd indices write: ceil(n / 2) readers, floor(n / 2) writers.
                int numberOfReadThreads = (numberOfThreads + 1) / 2;
                int numberOfWriteThreads = numberOfThreads / 2;
                CountDownLatch readLatch = new CountDownLatch(numberOfReadThreads);
                writeLatch = new CountDownLatch(numberOfWriteThreads);
                long start = System.nanoTime();
                for (int i = 0; i < numberOfThreads; i++) {
                    boolean isReadThread = i % 2 == 0;
                    new BigramTestsThread(dataType, isReadThread ? numberOfItemsRead : numberOfItemsWritten, Math.min(100 * 1024 * 1024, numberOfItems), inputStream, dataInterface, isReadThread ? readLatch : writeLatch, isReadThread).start();
                }
                readLatch.await(); //this assumes that reading data is faster than writing data.
                long endOfRead = System.nanoTime();
                writeLatch.await();
                dataInterface.flush();
                long endOfWrite = System.nanoTime();
                readsPerSecond = numberOfItemsRead.longValue() * 1e9 / (endOfRead - start);
                writesPerSecond = numberOfItemsWritten.longValue() * 1e9 / (endOfWrite - start);
            } finally {
                // The method signature does not allow IOException, so close quietly.
                IOUtils.closeQuietly(inputStream);
            }
        } finally {
            dataInterface.close();
        }
        UI.write(factory.getClass().getSimpleName() + " threads " + numberOfThreads + " items " + numberOfItems + " write " + NumUtils.fmt(writesPerSecond) + " read " + NumUtils.fmt(readsPerSecond));
    }

    /**
     * Creates the data interface under test. Its name is built from the data type,
     * the caching type and the simple class name of the factory.
     *
     * @throws RuntimeException if the data type is not handled
     */
    protected DataInterface createDataInterface(DataType dataType, DatabaseCachingType cachingType, DataInterfaceFactory factory) {
        String dataInterfaceName = "readWriteBigrams_" + dataType + "_" + cachingType + "_" + factory.getClass().getSimpleName();
        switch (dataType) {
            case LONG_COUNT:
                return factory.createDataInterface(cachingType, dataInterfaceName, Long.class, new LongCombinator());
            case SERIALIZED_OBJECT:
                return factory.createDataInterface(cachingType, dataInterfaceName, BigramCount.class, new BigramCountCombinator());
            default:
                throw new RuntimeException("Unknown data type " + dataType);
        }
    }
}