package com.github.lwhite1.tablesaw.examples;
import com.github.lwhite1.tablesaw.api.CategoryColumn;
import com.github.lwhite1.tablesaw.api.ColumnType;
import com.github.lwhite1.tablesaw.api.DateColumn;
import com.github.lwhite1.tablesaw.api.FloatColumn;
import com.github.lwhite1.tablesaw.api.IntColumn;
import com.github.lwhite1.tablesaw.api.Table;
import com.github.lwhite1.tablesaw.columns.packeddata.PackedLocalDate;
import com.github.lwhite1.tablesaw.index.IntIndex;
import com.github.lwhite1.tablesaw.io.csv.CsvReader;
import com.github.lwhite1.tablesaw.store.StorageManager;
import com.github.lwhite1.tablesaw.util.Selection;
import com.google.common.base.Stopwatch;
import com.opencsv.CSVWriter;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.RandomUtils;
import java.io.FileWriter;
import java.io.IOException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import static com.github.lwhite1.tablesaw.api.QueryHelper.column;
import static java.lang.System.out;
/**
* Tests manipulation of large (but not big) data sets
*/
public class ObservationDataTest {
private static final String CSV_FILE = "/Users/larrywhite/IdeaProjects/testdata/obs.csv";
private static final String DB = "/Users/larrywhite/IdeaProjects/testdata/nobs.csv.saw";
// pools to get random test data from
private static List<String> concepts = new ArrayList<>(100_000);
private static IntArrayList patientIds = new IntArrayList(1_000_000);
private static int size = 60 * 365;
private static IntArrayList dates = new IntArrayList(size);
public static void main(String[] args) throws Exception {
int numberOfRecordsInTable = 500_000_000;
Stopwatch stopwatch = Stopwatch.createStarted();
Table t;
// t = defineSchema();
// generateTestData(t, numberOfRecordsInTable, stopwatch);
// t = loadFromCsv(stopwatch);
t = loadFromColumnStore(stopwatch);
t.setName("Observations");
//writeToColumnStore(t, stopwatch);
String randomConcept1 = t.categoryColumn("concept").get(RandomUtils.nextInt(0, t.rowCount()));
String randomConcept2 = t.categoryColumn("concept").get(RandomUtils.nextInt(0, t.rowCount()));
int randomPatient1 = t.intColumn("patient").get(RandomUtils.nextInt(0, t.rowCount()));
int randomPatient2 = t.intColumn("patient").get(RandomUtils.nextInt(0, t.rowCount()));
stopwatch.reset().start();
Table result = t.selectWhere(column("concept").isEqualTo(randomConcept1));
out.println("concept found in " + stopwatch.elapsed(TimeUnit.MICROSECONDS) + " micros");
out.println("results found: " + result.rowCount());
out.println();
stopwatch.reset().start();
result = t.selectWhere(
column("concept").isEqualTo(randomConcept2));
out.println("concept found in " + stopwatch.elapsed(TimeUnit.MICROSECONDS) + " micros");
out.println("results found: " + result.rowCount());
out.println();
stopwatch.reset().start();
IntColumn patients = t.intColumn("patient");
IntIndex patientIndex = new IntIndex(patients);
out.println("patient index built in " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
out.println();
stopwatch.reset().start();
Selection selection = patientIndex.get(randomPatient1);
out.println("patient found in " + stopwatch.elapsed(TimeUnit.MICROSECONDS) + " micros");
out.println("patient records found: " + selection.size());
stopwatch.reset().start();
Table results = t.selectWhere(selection);
out.println("records retrieved in " + stopwatch.elapsed(TimeUnit.MICROSECONDS) + " micros");
out.println();
stopwatch.reset().start();
selection = patientIndex.get(randomPatient2);
out.println("patient found in " + stopwatch.elapsed(TimeUnit.MICROSECONDS) + " micros");
out.println("patients records found: " + selection.size());
stopwatch.reset().start();
results = t.selectWhere(selection);
out.println("records retrieved in " + stopwatch.elapsed(TimeUnit.MICROSECONDS) + " micros");
out.println();
stopwatch.reset().start();
t.floatColumn("value").sum();
out.println("Time to sum floats: " + stopwatch.elapsed(TimeUnit.MILLISECONDS) + "ms");
System.exit(0);
}
private static Table loadFromColumnStore(Stopwatch stopwatch) throws IOException {
stopwatch.reset().start();
Table t = StorageManager.readTable(DB);
out.println("Loaded from column store in " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
return t;
}
private static Table defineSchema() {
Table t;
t = Table.create("Observations");
CategoryColumn conceptId = CategoryColumn.create("concept");
DateColumn date = DateColumn.create("date");
FloatColumn value = FloatColumn.create("value");
IntColumn patientId = IntColumn.create("patient");
t.addColumn(conceptId);
t.addColumn(date);
t.addColumn(value);
t.addColumn(patientId);
return t;
}
private static void generateTestData(Table t, int numberOfRecordsInTable, Stopwatch stopwatch) throws IOException {
stopwatch.reset().start();
out.println("Generating test data");
generateData(numberOfRecordsInTable, t);
out.println("Time to generate "
+ numberOfRecordsInTable + " records: "
+ stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
}
private static void writeToColumnStore(Table t, Stopwatch stopwatch) throws Exception {
stopwatch = stopwatch.reset().start();
StorageManager.saveTable(DB, t);
out.println("Time to write out in columnStore format " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
}
private static Table loadFromCsv(Stopwatch stopwatch) throws IOException {
stopwatch.reset().start();
Table t;// ConceptId, Date, Value, PatientNo
ColumnType[] columnTypes = {ColumnType.CATEGORY, ColumnType.LOCAL_DATE, ColumnType.FLOAT, ColumnType.INTEGER};
t = CsvReader.read(columnTypes, CSV_FILE);
out.println("Time to read to CSV File " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
return t;
}
private static void generateData(int observationCount, Table table) throws IOException {
// createFromCsv pools of random values
while (concepts.size() <= 100_000) {
concepts.add(RandomStringUtils.randomAscii(30));
}
while (patientIds.size() <= 1_000_000) {
patientIds.add(RandomUtils.nextInt(0, 2_000_000_000));
}
while (dates.size() <= size) {
dates.add(PackedLocalDate.pack(randomDate()));
}
DateColumn dateColumn = table.dateColumn("date");
CategoryColumn conceptColumn = table.categoryColumn("concept");
FloatColumn valueColumn = table.floatColumn("value");
IntColumn patientColumn = table.intColumn("patient");
CSVWriter writer = new CSVWriter(new FileWriter(CSV_FILE));
String[] line = new String[4];
String[] header = {"concept", "date", "value", "patient"};
writer.writeNext(header);
// sample from the pools to write the data
for (int i = 0; i < observationCount; i++) {
line[0] = concepts.get(RandomUtils.nextInt(0, concepts.size()));
line[1] = PackedLocalDate.toDateString(dates.getInt(RandomUtils.nextInt(0, dates.size())));
line[2] = Float.toString(RandomUtils.nextFloat(0f, 100_000f));
line[3] = Integer.toString(patientIds.getInt(RandomUtils.nextInt(0, patientIds.size())));
writer.writeNext(line);
/*
dateColumn.add(dates.getInt(RandomUtils.nextInt(0, dates.size())));
conceptColumn.add(concepts.get(RandomUtils.nextInt(0, concepts.size())));
valueColumn.add(RandomUtils.nextFloat(0f, 100_000f));
patientColumn.add(patientIds.getInt(RandomUtils.nextInt(0, patientIds.size())));
*/
}
writer.flush();
writer.close();
concepts = null;
patientIds = null;
dates = null;
}
private static LocalDate randomDate() {
Random random = new Random();
int minDay = (int) LocalDate.of(1920, 1, 1).toEpochDay();
int maxDay = (int) LocalDate.of(2016, 1, 1).toEpochDay();
long randomDay = minDay + random.nextInt(maxDay - minDay);
return LocalDate.ofEpochDay(randomDay);
}
}