/*
* Concept profile generation tool suite
* Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
* Rotterdam, The Netherlands
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>
*/
package org.erasmusmc.collections.datasets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.erasmusmc.collections.OneToManyList;
import org.erasmusmc.utilities.ReadCSVFile;
import org.erasmusmc.utilities.WriteCSVFile;
/**
 * A {@link DataSet} implementation that keeps every row in memory as a list
 * of strings and resolves field names to column positions through a map.
 *
 * <p>Instances are normally obtained through {@link #loadFromCSV(String)} or
 * as the result of an {@code innerJoin}.
 */
public class DataSetInMemory implements DataSet {

    /** The rows of the data set; each row holds its cell values in column order. */
    private List<List<String>> data = new ArrayList<List<String>>();

    /** Maps each field (column) name to its zero-based column index. */
    private Map<String, Integer> fieldName2ColumnIndex = new HashMap<String, Integer>();

    /**
     * Loads a data set from a CSV file. The first row returned by the reader
     * is taken as the header (field names); every following row becomes a
     * data row.
     *
     * <p>NOTE(review): each header is mapped to the current size of the
     * name-to-index map, so a header name that occurs more than once replaces
     * the earlier mapping and the columns after it are numbered incorrectly.
     * Field names are expected to be unique.
     *
     * @param filename the CSV file to read
     * @return the loaded data set
     */
    public static DataSetInMemory loadFromCSV(String filename){
        DataSetInMemory dataSet = new DataSetInMemory();
        Iterator<List<String>> iterator = new ReadCSVFile(filename).iterator();

        //Parse header:
        dataSet.fieldName2ColumnIndex = new HashMap<String, Integer>();
        for (String header : iterator.next())
            dataSet.fieldName2ColumnIndex.put(header, dataSet.fieldName2ColumnIndex.size());

        //Load data:
        dataSet.data = new ArrayList<List<String>>();
        while (iterator.hasNext())
            dataSet.data.add(iterator.next());
        return dataSet;
    }

    /**
     * Writes the field names followed by all rows to a CSV file.
     *
     * @param filename the CSV file to write
     */
    public void writeToCSV(String filename){
        WriteCSVFile out = new WriteCSVFile(filename);
        try {
            out.write(getFieldNames());
            for (List<String> line : data)
                out.write(line);
        } finally {
            // Close the writer even when writing a row fails.
            out.close();
        }
    }

    /**
     * Returns an iterator over the rows of this data set. Calling
     * {@code remove()} on the iterator removes the row from the data set.
     */
    @Override
    public Iterator<DataLine> iterator() {
        return new DataIterator();
    }

    /**
     * Inner join on a single field; see {@link #innerJoin(DataSet, String[])}.
     *
     * @param other   the data set to join with
     * @param byField the name of the field that must match in both data sets
     * @return a new data set holding the joined rows
     */
    public DataSet innerJoin(DataSet other, String byField){
        return innerJoin(other, new String[]{byField});
    }

    /**
     * Joins this data set with another one on the given fields. For every row
     * of {@code other}, each row of this data set with identical values in all
     * {@code byFields} yields one output row: the cells of this row followed
     * by the cells of the other row's remaining fields. Rows without a match
     * in the other data set are dropped. Output rows follow the iteration
     * order of {@code other}.
     *
     * <p>Fields of {@code other} whose name already exists in this data set
     * (apart from the join fields) are skipped and a warning is printed to
     * {@code System.err}.
     *
     * <p>NOTE(review): the cells of the other data set are appended to a copy
     * of this data set's row, so the result is only aligned with the merged
     * field names when each row of this data set has exactly one cell per
     * field.
     *
     * @param other    the data set to join with
     * @param byFields the names of the fields that must match in both data sets
     * @return a new data set holding the joined rows
     * @throws RuntimeException if a join field does not exist in this data
     *                          set, or a row is too short to contain it
     */
    public DataSet innerJoin(DataSet other, String[] byFields) {
        //Resolve the join fields to column indices of this data set:
        int[] indices = new int[byFields.length];
        for (int i = 0; i < byFields.length; i++){
            Integer index = fieldName2ColumnIndex.get(byFields[i]);
            if (index == null)
                throw new RuntimeException("Field name not found: " + byFields[i]);
            indices[i] = index;
        }

        //Build index from join key to the rows of this data set:
        OneToManyList<String, List<String>> key2dataLine = new OneToManyList<String, List<String>>();
        int rowIndex = 0;
        for (List<String> line : data){
            StringBuilder key = new StringBuilder();
            for (int index : indices){
                if (index >= line.size())
                    throw new RuntimeException("Row " + rowIndex + " does not have enough columns (expected "+fieldName2ColumnIndex.size()+", found " + line.size()+")");
                key.append(line.get(index));
                key.append('\n');
            }
            key2dataLine.put(key.toString(), line);
            rowIndex++;
        }

        //Create new dataset, and merge field names:
        DataSetInMemory merged = new DataSetInMemory();
        merged.data = new ArrayList<List<String>>();
        merged.fieldName2ColumnIndex = new HashMap<String, Integer>(fieldName2ColumnIndex);
        // Work on a copy: the list handed out by the other data set may be
        // its internal state or unmodifiable.
        List<String> otherFieldNames = new ArrayList<String>(other.getFieldNames());
        for (String byField : byFields)
            otherFieldNames.remove(byField);
        List<String> fieldsToCopy = new ArrayList<String>();
        for (String field : otherFieldNames){
            if (fieldName2ColumnIndex.containsKey(field))
                System.err.println("Warning: duplicate field name found: " + field + ", ignoring one field");
            else {
                merged.fieldName2ColumnIndex.put(field, merged.fieldName2ColumnIndex.size());
                fieldsToCopy.add(field);
            }
        }

        //Merge data:
        for (DataLine otherLine : other){
            StringBuilder key = new StringBuilder();
            for (String byField : byFields){
                key.append(otherLine.get(byField));
                key.append('\n');
            }
            List<List<String>> thisLines = key2dataLine.get(key.toString());
            // Defensive: whether OneToManyList returns null or an empty list
            // for an unknown key is not visible here; both mean "no match".
            if (thisLines == null)
                continue;
            for (List<String> thisLine : thisLines){
                List<String> newLine = new ArrayList<String>(thisLine);
                for (String field : fieldsToCopy)
                    newLine.add(otherLine.get(field));
                merged.data.add(newLine);
            }
        }
        return merged;
    }

    /**
     * Returns the field names ordered by column index. The returned list is a
     * fresh copy; modifying it does not affect this data set.
     */
    @Override
    public List<String> getFieldNames() {
        int size = fieldName2ColumnIndex.size();
        List<String> fieldNames = new ArrayList<String>(size);
        // Pre-fill so every name can be placed at its column index.
        for (int i = 0; i < size; i++)
            fieldNames.add(null);
        for (Map.Entry<String, Integer> entry : fieldName2ColumnIndex.entrySet())
            fieldNames.set(entry.getValue(), entry.getKey());
        return fieldNames;
    }

    /** Iterates over the stored rows, wrapping each one in a {@link DataLine}. */
    private class DataIterator implements Iterator<DataLine>{
        private Iterator<List<String>> iterator;

        public DataIterator(){
            iterator = data.iterator();
        }

        @Override
        public boolean hasNext() {
            return iterator.hasNext();
        }

        @Override
        public DataLine next() {
            return new DataLineImplementation(iterator.next());
        }

        @Override
        public void remove() {
            iterator.remove();
        }
    }

    /** View on a single row that gives access to its cells by field name. */
    private class DataLineImplementation implements DataLine {
        private List<String> dataLine;

        public DataLineImplementation(List<String> dataLine){
            this.dataLine = dataLine;
        }

        /**
         * Returns the cell value of the given field.
         *
         * @throws RuntimeException if the field does not exist or the row is
         *                          too short to contain it
         */
        public String get(String field) {
            Integer index = fieldName2ColumnIndex.get(field);
            if (index == null)
                throw new RuntimeException("Field name not found: " + field);
            if (index >= dataLine.size())
                throw new RuntimeException("Row " + data.indexOf(dataLine) + " does not have enough columns (expected "+fieldName2ColumnIndex.size()+", found " + dataLine.size()+")");
            return dataLine.get(index);
        }

        /** Returns the cell value of the given field parsed as an {@code int}. */
        public int getInt(String fieldName){
            return Integer.parseInt(get(fieldName));
        }

        /** Returns the cell value of the given field parsed as a {@code long}. */
        public long getLong(String fieldName){
            return Long.parseLong(get(fieldName));
        }

        /** Returns the cell value of the given field parsed as a {@code double}. */
        public double getDouble(String fieldName){
            return Double.parseDouble(get(fieldName));
        }
    }

    /**
     * Renames a field; the column keeps its position and data.
     *
     * @param oldName the current name of the field
     * @param newName the new name of the field
     * @throws RuntimeException if {@code oldName} does not exist, or if
     *                          {@code newName} is already used by another field
     */
    @Override
    public void renameField(String oldName, String newName) {
        Integer index = fieldName2ColumnIndex.get(oldName);
        // Without this check a null index would be stored under the new name
        // and break every later lookup.
        if (index == null)
            throw new RuntimeException("Field name not found: " + oldName);
        Integer existing = fieldName2ColumnIndex.get(newName);
        if (existing != null) {
            if (existing.equals(index))
                return; // renaming a field to its own name changes nothing
            // Overwriting would silently drop the other column's mapping.
            throw new RuntimeException("Field name already exists: " + newName);
        }
        fieldName2ColumnIndex.remove(oldName);
        fieldName2ColumnIndex.put(newName, index);
    }
}