package org.molgenis.data.csv;
import au.com.bytecode.opencsv.CSVReader;
import org.apache.commons.io.IOUtils;
import org.molgenis.data.Entity;
import org.molgenis.data.MolgenisDataException;
import org.molgenis.data.UnknownEntityException;
import org.molgenis.data.meta.model.EntityType;
import org.molgenis.data.processor.AbstractCellProcessor;
import org.molgenis.data.processor.CellProcessor;
import org.molgenis.data.support.DynamicEntity;
import org.molgenis.data.support.GenericImporterExtensions;
import org.molgenis.util.CloseableIterator;
import org.springframework.util.StringUtils;
import java.io.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
/**
 * Iterates over the data rows of a delimited text file and exposes each row as an {@link Entity}.
 * <p>
 * The source is either a plain file whose name starts with the repository name, or a zip archive containing an
 * entry whose name (without extension) equals the repository name, both compared case-insensitively. The first
 * line is read as the header; every following line is turned into a {@link DynamicEntity} keyed by the (processed)
 * header names.
 * <p>
 * The underlying reader (and zip file, if any) is closed when {@link #hasNext()} first returns {@code false} or
 * when {@link #close()} is called explicitly.
 */
public class CsvIterator implements CloseableIterator<Entity>
{
    private static final Charset CHARSET = Charset.forName("UTF-8");

    private final String repositoryName;
    private final EntityType entityType;
    private final List<CellProcessor> cellProcessors;
    private final Map<String, Integer> colNamesMap; // column names index
    private final Character separator;

    private ZipFile zipFile;
    private CSVReader csvReader;

    /** Row fetched by {@link #get()} that has not yet been handed out by {@link #next()}. */
    private Entity next;
    /** {@code true} when the next call to {@link #get()} must read a new line from the reader. */
    private boolean getNext = true;

    /**
     * Creates an iterator that builds entities without an entity type.
     *
     * @param file           plain delimited file or zip archive to read
     * @param repositoryName name used to select the file or zip entry
     * @param cellProcessors processors applied to every header and value cell
     * @param separator      column separator, or {@code null} to derive it from the file extension
     */
    public CsvIterator(File file, String repositoryName, List<CellProcessor> cellProcessors, Character separator)
    {
        this(file, repositoryName, cellProcessors, separator, null);
    }

    /**
     * Opens the source and reads the header line.
     *
     * @param file           plain delimited file or zip archive to read
     * @param repositoryName name used to select the file or zip entry
     * @param cellProcessors processors applied to every header and value cell
     * @param separator      column separator, or {@code null} to derive it from the file extension
     * @param entityType     entity type passed to every created {@link DynamicEntity}; may be {@code null}
     * @throws UnknownEntityException if no file or zip entry matches {@code repositoryName}
     * @throws MolgenisDataException  if the source cannot be read or its file type is not recognised
     */
    public CsvIterator(File file, String repositoryName, List<CellProcessor> cellProcessors, Character separator,
            EntityType entityType)
    {
        this.repositoryName = repositoryName;
        this.cellProcessors = cellProcessors;
        this.separator = separator;
        this.entityType = entityType;

        try
        {
            // getFilenameExtension returns null for names without an extension, so the constant is the receiver
            String extension = StringUtils.getFilenameExtension(file.getName());
            if (GenericImporterExtensions.ZIP.toString().equalsIgnoreCase(extension))
            {
                zipFile = new ZipFile(file.getAbsolutePath());
                for (Enumeration<? extends ZipEntry> e = zipFile.entries(); e.hasMoreElements(); )
                {
                    ZipEntry entry = e.nextElement();
                    if (StringUtils.stripFilenameExtension(entry.getName()).equalsIgnoreCase(repositoryName))
                    {
                        csvReader = createCSVReader(entry.getName(), zipFile.getInputStream(entry));
                        break;
                    }
                }
            }
            else if (file.getName().toLowerCase(Locale.ROOT).startsWith(repositoryName.toLowerCase(Locale.ROOT)))
            {
                csvReader = createCSVReader(file.getName(), new FileInputStream(file));
            }

            if (csvReader == null)
            {
                throw new UnknownEntityException("Unknown entity [" + repositoryName + "] ");
            }

            colNamesMap = toColNamesMap(csvReader.readNext());
        }
        catch (IOException e)
        {
            // do not leak the zip file / reader when construction fails half-way
            closeResources();
            throw new MolgenisDataException("Exception reading [" + file.getAbsolutePath() + "]", e);
        }
        catch (RuntimeException e)
        {
            closeResources();
            throw e;
        }
    }

    /**
     * @return the header index: processed column name mapped to its zero-based column position, in header order.
     * The returned map is the iterator's internal instance, not a copy.
     */
    public Map<String, Integer> getColNamesMap()
    {
        return colNamesMap;
    }

    /**
     * Reports whether another row is available; closes the underlying resources once the rows are exhausted.
     */
    @Override
    public boolean hasNext()
    {
        boolean available = get() != null;
        if (!available)
        {
            close();
        }
        return available;
    }

    /**
     * Returns the pending row and marks it as consumed.
     * <p>
     * NOTE: when no row is left this returns {@code null} instead of throwing
     * {@link java.util.NoSuchElementException}; kept as-is because callers may rely on it.
     */
    @Override
    public Entity next()
    {
        Entity entity = get();
        getNext = true;
        return entity;
    }

    /**
     * Reads the next line from the reader if the previous one was consumed and converts it to an entity.
     * <p>
     * NOTE(review): a line holding fewer values than there are header columns is treated like end of input, so any
     * rows after it are not returned.
     *
     * @return the pending entity, or {@code null} when no (sufficiently long) line is available
     * @throws MolgenisDataException if reading the line fails
     */
    private Entity get()
    {
        if (!getNext)
        {
            return next;
        }

        try
        {
            String[] values = csvReader.readNext();
            if ((values != null) && (values.length >= colNamesMap.size()))
            {
                for (int i = 0; i < values.length; ++i)
                {
                    // subsequent separators indicate null values instead of empty strings
                    String value = values[i].isEmpty() ? null : values[i];
                    values[i] = processCell(value, false);
                }

                next = new DynamicEntity(entityType);
                for (Map.Entry<String, Integer> column : colNamesMap.entrySet())
                {
                    next.set(column.getKey(), values[column.getValue()]);
                }
            }
            else
            {
                next = null;
            }
            getNext = false;
        }
        catch (IOException e)
        {
            throw new MolgenisDataException("Exception reading line of csv file [" + repositoryName + "]", e);
        }
        return next;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove()
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Closes the reader and, if the source was a zip archive, the zip file. Errors while closing are ignored.
     */
    @Override
    public void close()
    {
        closeResources();
    }

    /**
     * Quietly closes whatever has been opened so far. Kept private so the constructor can call it without invoking
     * an overridable method.
     */
    private void closeResources()
    {
        // closeQuietly tolerates null, so no explicit null checks are needed
        IOUtils.closeQuietly(csvReader);
        IOUtils.closeQuietly(zipFile);
    }

    /**
     * Wraps the stream in a {@link CSVReader}. An explicitly configured separator wins; otherwise the separator is
     * derived from the file extension (csv/txt: reader default, tsv: tab).
     *
     * @param fileName name used to determine the file type when no separator was configured
     * @param in       stream to read from; closed by this method if no reader can be created
     * @return a reader over {@code in}, decoded with {@link #CHARSET}
     * @throws MolgenisDataException if no separator was configured and the extension is not recognised
     */
    private CSVReader createCSVReader(String fileName, InputStream in)
    {
        Reader reader = new InputStreamReader(in, CHARSET);
        try
        {
            if (separator != null)
            {
                return new CSVReader(reader, this.separator);
            }

            String lowerCaseName = fileName.toLowerCase(Locale.ROOT);
            if (lowerCaseName.endsWith('.' + GenericImporterExtensions.CSV.toString()) || lowerCaseName
                    .endsWith('.' + GenericImporterExtensions.TXT.toString()))
            {
                return new CSVReader(reader);
            }
            if (lowerCaseName.endsWith('.' + GenericImporterExtensions.TSV.toString()))
            {
                return new CSVReader(reader, '\t');
            }
            throw new MolgenisDataException("Unknown file type: [" + fileName + "] for csv repository");
        }
        catch (RuntimeException e)
        {
            // the caller never receives a reader in this case, so the stream must be released here
            IOUtils.closeQuietly(reader);
            throw e;
        }
    }

    /**
     * Builds the header index from the first line of the file.
     *
     * @param headers raw header cells; may be {@code null} or empty
     * @return processed header name mapped to column position in header order, or an empty map when there are no
     * headers. If two headers are equal after processing, the later position replaces the earlier one.
     */
    private Map<String, Integer> toColNamesMap(String[] headers)
    {
        if ((headers == null) || (headers.length == 0))
        {
            return Collections.emptyMap();
        }

        // pre-size so that the map does not need to rehash at the default 0.75 load factor
        int capacity = (int) (headers.length / 0.75) + 1;
        Map<String, Integer> columnIdx = new LinkedHashMap<>(capacity);
        for (int i = 0; i < headers.length; ++i)
        {
            String header = processCell(headers[i], true);
            columnIdx.put(header, i);
        }
        return columnIdx;
    }

    /**
     * Runs the configured cell processors over a single header or value cell.
     */
    private String processCell(String value, boolean isHeader)
    {
        return AbstractCellProcessor.processCell(value, isHeader, cellProcessors);
    }
}