package com.github.lwhite1.tablesaw.filters;
import com.github.lwhite1.tablesaw.api.Table;
import com.github.lwhite1.tablesaw.api.CategoryColumn;
import com.github.lwhite1.tablesaw.columns.ColumnReference;
import com.github.lwhite1.tablesaw.api.DateColumn;
import com.github.lwhite1.tablesaw.api.FloatColumn;
import com.github.lwhite1.tablesaw.api.IntColumn;
import com.github.lwhite1.tablesaw.columns.packeddata.PackedLocalDate;
import com.github.lwhite1.tablesaw.table.TemporaryView;
import com.github.lwhite1.tablesaw.table.ViewGroup;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.RandomUtils;
import java.io.IOException;
import java.time.LocalDate;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.TimeUnit;
import static com.github.lwhite1.tablesaw.api.QueryHelper.*;
import static java.lang.System.out;
/**
 * Tests manipulation of large (but not big) data sets.
 *
 * <p>{@link #main(String[])} builds a table of randomly generated observations
 * (concept, date, value, patient), then narrows the set of per-patient sub-tables using
 * non-temporal and independent temporal constraints, and finally computes a date range
 * around the qualifying event for every remaining patient. The dependent temporal clause
 * is not implemented yet.
 */
public class TimeDependentFilteringTest {

  private static final int CONCEPT_COUNT = 10;
  private static final int PATIENT_COUNT = 10_000;
  private static final int DATE_COUNT = 60 * 365;

  // One shared generator; randomDate() is called once per pooled date
  private static final Random RANDOM = new Random();

  // pools to get random test data from
  private static final List<String> concepts = new ArrayList<>(CONCEPT_COUNT);
  private static final IntArrayList patientIds = new IntArrayList(PATIENT_COUNT);
  // dates are stored in packed (int) form, see PackedLocalDate.pack
  private static final IntArrayList dates = new IntArrayList(DATE_COUNT);

  /**
   * Generates the test table and runs the filtering steps against it.
   *
   * @param args ignored
   * @throws Exception if test data generation fails
   */
  public static void main(String[] args) throws Exception {

    int numberOfRecordsInTable = 100_000;
    Stopwatch stopwatch = Stopwatch.createStarted();

    Table t = defineSchema();
    generateTestData(t, numberOfRecordsInTable, stopwatch);
    t.setName("Observations");

    // non temporal constraints
    String conceptA = randomConcept(t);
    String conceptB = randomConcept(t);

    // independent temporal constraints
    String conceptZ = randomConcept(t);
    String conceptD = randomConcept(t);
    DependencyFilter independentConstraintFilter = DependencyFilter.FIRST;

    // dependent temporal constraints (inputs for the dependent clause, which is still a stub below)
    String conceptE = randomConcept(t);
    String conceptF = randomConcept(t);
    DependencyFilter dependentConstraintFilter = DependencyFilter.ANY;

    // temporal dependency range constraint
    Range<Integer> daysConstraint = Range.closed(0, 0);

    ColumnReference concept = column("concept");

    // Non-temporal clause
    Table nt = t.selectWhere(
        both(concept.isEqualTo(conceptA),
            (concept.isNotEqualTo(conceptB))));

    IntColumn ntPatients = nt.intColumn("patient");

    // Group the original table by patient id
    ViewGroup patients = ViewGroup.create(t, "patient");

    // Create a list of patient sub-tables to work with.
    // TODO(lwhite): Build the copy-on-write into ViewGroups to avoid this extra copy
    CopyOnWriteArrayList<TemporaryView> patientTables = new CopyOnWriteArrayList<>(patients.getSubTables());

    // Apply the independent temporal event filtering to the patient subtables and remove any that don't pass.
    // The non-temporal constraint is checked in the same pass for efficiency.
    for (TemporaryView patientTable : patients) {
      CategoryColumn patientConcepts = patientTable.categoryColumn("concept");
      int patientId = Integer.parseInt(patientTable.name());
      boolean failsTemporalConstraint =
          !patientConcepts.contains(conceptZ) || patientConcepts.contains(conceptD);
      if (failsTemporalConstraint || !ntPatients.contains(patientId)) {
        patientTables.remove(patientTable);
      }
    }

    List<IndependentResult> independentResults = new ArrayList<>();

    // Working with the filtered patient tables, calculate the event dates for the independent events
    for (TemporaryView patientTable : patientTables) {
      IndependentResult result = new IndependentResult(Integer.parseInt(patientTable.name()));

      // Column lookups do not depend on the row, so do them once per patient table
      CategoryColumn patientConcepts = patientTable.categoryColumn("concept");
      DateColumn patientDates = patientTable.dateColumn("date");

      // iterate an individual table and find the rows where concept matches the target concept
      List<LocalDate> eventDates = new ArrayList<>();
      for (int row : patientTable) {
        if (patientConcepts.get(row).equals(conceptZ)) {
          eventDates.add(patientDates.get(row));
        }
      }

      if (independentConstraintFilter == DependencyFilter.FIRST) {
        if (eventDates.isEmpty()) {
          // this is an error: every remaining patient table was required to contain conceptZ
          System.out.println(patientTable.name());
        } else {
          // Get the first (earliest) event for the current patient and create a date range around it.
          // Rows are not ordered by date, so the earliest date has to be searched for.
          LocalDate date = earliestDate(eventDates);
          result.addRange(Range.closed(date.minusDays(daysConstraint.lowerEndpoint()),
              date.plusDays(daysConstraint.upperEndpoint())));
        }
      } // TODO handle last and any cases
      independentResults.add(result);
    }

    for (TemporaryView patientTable : patientTables) {
      // for every date range in rangeSet
      // .. find any rows containing events matching the rangeSet
      // .. run dependent clause on those rows
    }
    System.out.println("Done");
  }

  /**
   * Returns the concept found in a randomly chosen row of the given table.
   */
  private static String randomConcept(Table t) {
    return t.categoryColumn("concept").get(RandomUtils.nextInt(0, t.rowCount()));
  }

  /**
   * Returns the earliest date in the given non-empty list.
   */
  private static LocalDate earliestDate(List<LocalDate> eventDates) {
    LocalDate earliest = eventDates.get(0);
    for (LocalDate candidate : eventDates) {
      if (candidate.isBefore(earliest)) {
        earliest = candidate;
      }
    }
    return earliest;
  }

  /**
   * Creates an empty "Observations" table with the columns concept, date, value and patient.
   */
  private static Table defineSchema() {
    Table t = Table.create("Observations");
    t.addColumn(CategoryColumn.create("concept"));
    t.addColumn(DateColumn.create("date"));
    t.addColumn(FloatColumn.create("value"));
    t.addColumn(IntColumn.create("patient"));
    return t;
  }

  /**
   * Fills the table with random observations and prints how long that took.
   *
   * @param t                      table to append the generated rows to
   * @param numberOfRecordsInTable number of rows to generate
   * @param stopwatch              stopwatch that is reset and restarted to time the generation
   */
  private static void generateTestData(Table t, int numberOfRecordsInTable, Stopwatch stopwatch) throws IOException {
    stopwatch.reset().start();
    out.println("Generating test data");
    generateData(numberOfRecordsInTable, t);
    out.println("Time to generate "
        + numberOfRecordsInTable + " records: "
        + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
  }

  /**
   * Fills the random value pools (if they are not full yet) and appends
   * {@code observationCount} rows sampled from those pools to the table.
   *
   * @param observationCount number of rows to append
   * @param table            table with the columns created by {@link #defineSchema()}
   */
  private static void generateData(int observationCount, Table table) throws IOException {
    // create pools of random values; each pool holds exactly the configured number of entries
    while (concepts.size() < CONCEPT_COUNT) {
      concepts.add(RandomStringUtils.randomAscii(30));
    }
    while (patientIds.size() < PATIENT_COUNT) {
      patientIds.add(RandomUtils.nextInt(0, 2_000_000_000));
    }
    while (dates.size() < DATE_COUNT) {
      dates.add(PackedLocalDate.pack(randomDate()));
    }

    DateColumn dateColumn = table.dateColumn("date");
    CategoryColumn conceptColumn = table.categoryColumn("concept");
    FloatColumn valueColumn = table.floatColumn("value");
    IntColumn patientColumn = table.intColumn("patient");

    // sample from the pools to write the data
    for (int i = 0; i < observationCount; i++) {
      dateColumn.add(dates.getInt(RandomUtils.nextInt(0, dates.size())));
      conceptColumn.add(concepts.get(RandomUtils.nextInt(0, concepts.size())));
      valueColumn.add(RandomUtils.nextFloat(0f, 100_000f));
      patientColumn.add(patientIds.getInt(RandomUtils.nextInt(0, patientIds.size())));
    }
  }

  // TODO(lwhite): Put this in a Test utils class
  /**
   * Returns a random date from 2000-01-01 (inclusive) to 2016-01-01 (exclusive).
   */
  private static LocalDate randomDate() {
    int minDay = (int) LocalDate.of(2000, 1, 1).toEpochDay();
    int maxDay = (int) LocalDate.of(2016, 1, 1).toEpochDay();
    long randomDay = minDay + RANDOM.nextInt(maxDay - minDay);
    return LocalDate.ofEpochDay(randomDay);
  }

  /**
   * The date ranges computed for one patient from its independent temporal events.
   */
  private static class IndependentResult {

    final int patientId;
    final RangeSet<LocalDate> dateRanges = TreeRangeSet.create();

    IndependentResult(int patientId) {
      this.patientId = patientId;
    }

    /** Adds a date range to this patient's range set. */
    void addRange(Range<LocalDate> dateRange) {
      dateRanges.add(dateRange);
    }
  }

  /**
   * Selects which of a patient's matching events a temporal constraint applies to.
   * Only {@code FIRST} is handled by {@code main} so far.
   */
  private enum DependencyFilter {
    FIRST,
    LAST,
    ANY
  }
}