package com.github.lwhite1.tablesaw.examples;
import com.github.lwhite1.tablesaw.api.Table;
import com.github.lwhite1.tablesaw.TestDataUtil;
import com.github.lwhite1.tablesaw.api.ColumnType;
import com.github.lwhite1.tablesaw.api.CategoryColumn;
import com.github.lwhite1.tablesaw.api.DateColumn;
import com.github.lwhite1.tablesaw.columns.packeddata.PackedLocalDate;
import com.github.lwhite1.tablesaw.io.csv.CsvReader;
import com.github.lwhite1.tablesaw.io.csv.CsvWriter;
import com.github.lwhite1.tablesaw.store.StorageManager;
import com.github.lwhite1.tablesaw.api.BooleanColumn;
import com.github.lwhite1.tablesaw.api.ShortColumn;
import com.google.common.base.Stopwatch;
import io.codearte.jfairy.Fairy;
import io.codearte.jfairy.producer.person.Person;
import java.time.LocalDate;
import java.util.concurrent.TimeUnit;
/**
 * Example program that exercises manipulation of large (but not big) data sets.
 *
 * <p>{@link #main(String[])} reads a CSV file of generated "people" records into a {@link Table},
 * saves the table through {@link StorageManager}, prints a few summaries, and writes the table
 * back out as CSV, printing the elapsed time of each step to standard output.
 */
public class LargeDataTest {

    /** Path of the CSV file read by {@link #main(String[])} and {@link #storeInDb()}. */
    private static final String CSV_FILE = "bigdata/people1.csv";

    /**
     * Runs the timed read / store / summarize / write sequence against {@link #CSV_FILE}.
     *
     * @param args ignored
     * @throws Exception if reading, storing or writing the table fails
     */
    public static void main(String[] args) throws Exception {

        Stopwatch stopwatch = Stopwatch.createStarted();

        Table t = CsvReader.read(peopleColumnTypes(), CSV_FILE);
        System.out.println("Time to read from CSV File " + stopwatch.elapsed(TimeUnit.SECONDS));

        stopwatch.reset().start();
        storeInDb(t);
        System.out.println("Time to store in columnStore " + stopwatch.elapsed(TimeUnit.SECONDS));

        stopwatch.reset().start();
        System.out.println(t.categoryColumn("first name").first(5).print());
        System.out.println("Time to print first 5 from first name column "
                + stopwatch.elapsed(TimeUnit.MILLISECONDS) + " ms");
        System.out.println();

        stopwatch.reset().start();
        System.out.println(t.shortColumn("weight").summary().print());
        System.out.println("Time to summarize weight column " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
        System.out.println();

        stopwatch.reset().start();
        System.out.println(t.shortColumn("height").summary().print());
        System.out.println("Time to summarize height column " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
        System.out.println();

        stopwatch.reset().start();
        System.out.println(t.first(5).print());
        System.out.println("Time to print first(5) " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
        System.out.println();

        stopwatch.reset().start();
        System.out.println(t.structure().print());
        System.out.println("Time to print structure " + stopwatch.elapsed(TimeUnit.SECONDS) + " seconds");
        System.out.println();

        stopwatch.reset().start();
        CsvWriter.write("bigdata/shortpeople2.csv", t);
        System.out.println("Time to write csv file " + stopwatch.elapsed(TimeUnit.SECONDS));
        System.out.println();
    }

    /**
     * Returns the column types used to parse {@link #CSV_FILE}, in column order.
     *
     * <p>The order mirrors the columns added in {@link #createPeoples(int)}: six category
     * columns, one date column, two short columns and one boolean column. A fresh array is
     * returned on every call so that no caller shares mutable state with another.
     *
     * @return a new ten-element array of column types
     */
    private static ColumnType[] peopleColumnTypes() {
        return new ColumnType[] {
                ColumnType.CATEGORY,   // first name
                ColumnType.CATEGORY,   // last name
                ColumnType.CATEGORY,   // company
                ColumnType.CATEGORY,   // city
                ColumnType.CATEGORY,   // postal code
                ColumnType.CATEGORY,   // state
                ColumnType.LOCAL_DATE, // birth date
                ColumnType.SHORT_INT,  // height
                ColumnType.SHORT_INT,  // weight
                ColumnType.BOOLEAN     // female
        };
    }

    /**
     * Currently a no-op: the CSV generator below is disabled and kept only for reference.
     *
     * <p>NOTE(review): the disabled code refers to names that are not defined in this class
     * ({@code person}, {@code fairy}, {@code usStateArray}, {@code String.column}) and fills the
     * entries in a different order than its own header (for example, index 3 receives the date of
     * birth while the header names index 3 "city"). It would need repair before being re-enabled.
     *
     * @param t unused
     * @throws Exception never thrown by the current (empty) body
     */
    private static void createPeople(Table t) throws Exception {
    /*  try (CSVWriter writer = new CSVWriter(new FileWriter("people.csv"))) {
          String[] header = {"first name", "last name", "company",
              "city", "postal code", "state", "birthdate", "height", "weight", "female"};
          writer.writeNext(header);
          for (int r = 0; r < 300_000_000; r++) {
            if (r % 1_000_000 == 0) {
              System.out.println(r);
              writer.flush();
            }
            String[] entries = new String[header.length];
            entries[0] = person.firstName();
            entries[1] = person.lastName();
            entries[2] = person.getCompany().name();
            entries[3] = person.dateOfBirth().toLocalDate().toString();
            entries[4] = person.getAddress().getCity();
            entries[5] = person.getAddress().getPostalCode();
            entries[6] = fairy.baseProducer().randomElement(usStateArray);
            entries[7] = String.column(fairy.baseProducer().randomBetween(65, 280));
            entries[8] = String.column(fairy.baseProducer().randomBetween(64, 78));
            entries[9] = String.column(person.isFemale());
            writer.writeNext(entries);
          }
        }*/
    }

    /**
     * Builds an in-memory table named "People" filled with randomly generated records.
     *
     * <p>Progress (the current row index) is printed every 1,000,000 rows, and the total
     * generation time in seconds is printed once the loop finishes.
     *
     * @param quantity number of rows to generate
     * @return the populated table
     * @throws Exception if record generation fails
     */
    private static Table createPeoples(int quantity) throws Exception {

        Stopwatch stopwatch = Stopwatch.createStarted();
        Fairy fairy = Fairy.create();

        Table t = Table.create("People");
        CategoryColumn fName = CategoryColumn.create("first name");
        CategoryColumn lName = CategoryColumn.create("last name");
        CategoryColumn company = CategoryColumn.create("company");
        CategoryColumn city = CategoryColumn.create("city");
        CategoryColumn postalCode = CategoryColumn.create("postal code");
        CategoryColumn state = CategoryColumn.create("state");
        DateColumn birthDate = DateColumn.create("birth date");
        ShortColumn height = ShortColumn.create("height");
        ShortColumn weight = ShortColumn.create("weight");
        BooleanColumn female = BooleanColumn.create("female");

        t.addColumn(fName);
        t.addColumn(lName);
        t.addColumn(company);
        t.addColumn(city);
        t.addColumn(postalCode);
        t.addColumn(state);
        t.addColumn(birthDate);
        t.addColumn(height);
        t.addColumn(weight);
        t.addColumn(female);

        for (int r = 0; r < quantity; r++) {
            if (r % 1_000_000 == 0) {
                System.out.println(r);
            }
            Person person = fairy.person();
            fName.add(person.firstName());
            lName.add(person.lastName());
            company.add(person.getCompany().name());
            // The generated date of birth is converted to java.time.LocalDate through its
            // string form before being packed for the date column.
            birthDate.add(PackedLocalDate.pack(LocalDate.parse(person.dateOfBirth().toLocalDate().toString())));
            city.add(person.getAddress().getCity());
            postalCode.add(person.getAddress().getPostalCode());
            state.add(TestDataUtil.randomUsState());
            weight.add((short) fairy.baseProducer().randomBetween(65, 280));
            height.add((short) fairy.baseProducer().randomBetween(64, 78));
            female.add(person.isFemale());
        }
        System.out.println("Time to generate " + stopwatch.elapsed(TimeUnit.SECONDS));
        return t;
    }

    /**
     * Reads {@link #CSV_FILE} and saves the resulting table under {@code bigdata/people}.
     *
     * <p>The file is parsed with the same column types that {@link #main(String[])} uses, since
     * both read the same file.
     *
     * @throws Exception if the file cannot be read or the table cannot be saved
     */
    private static void storeInDb() throws Exception {
        Table t = CsvReader.read(peopleColumnTypes(), CSV_FILE);
        StorageManager.saveTable("bigdata/people", t);
    }

    /**
     * Saves the given table under {@code bigdata/peopleShort2}.
     *
     * @param t the table to save
     * @throws Exception if the table cannot be saved
     */
    private static void storeInDb(Table t) throws Exception {
        StorageManager.saveTable("bigdata/peopleShort2", t);
    }
}