// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.demux;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.*;
/**
 * In-memory table of sample sheet rows read from comma-separated text.
 *
 * <p>The first line of the input is a heading line naming the columns (quotes and
 * letter case are ignored); every following line is one {@link Entry}.  The
 * {@code lane}, {@code sampleid} and {@code index} columns are mandatory, the other
 * recognized columns are optional.
 *
 * <p>Rows are split on every comma, so quoted fields that themselves contain a comma
 * are not supported.
 */
public class SampleSheet implements Iterable<SampleSheet.Entry>
{
    /** Signals that the sample sheet text is structurally or semantically invalid. */
    public static class FormatException extends Exception {
        private static final long serialVersionUID = 1L;

        public FormatException(String msg) {
            super(msg);
        }
    }

    /** Minimum accepted length of a non-empty bar code (index) sequence. */
    public static final int BAR_CODE_MIN_LENGTH = 6;
    /** Maximum accepted length of a non-empty bar code (index) sequence. */
    public static final int BAR_CODE_MAX_LENGTH = 12;

    // defaults
    private static final int InitNumEntries = 100;
    // Matches a double quote at the very start or the very end of a field.
    private static final Pattern QuotePattern = Pattern.compile("^\"|\"$");
    private static final String ExpectedHeading = "fcid,lane,sampleid,sampleref,index,description,control,recipe,operator";

    /** Recognized column names, spelled as they appear (lower-cased) in the heading line. */
    private enum Heading {
        fcid,
        lane,
        sampleid,
        sampleref,
        index,
        description,
        control,
        recipe,
        operator,
        sampleproject;
    }

    /** Columns that every sample sheet must provide. */
    private static final EnumSet<Heading> RequiredColumns =
        EnumSet.of( Heading.lane, Heading.sampleid, Heading.index );

    // private fields

    /* Flat list of all the entries (rows) of the sample sheet.  After a successful
     * load of more than one row the list is ordered by lane number.
     */
    private List<Entry> table;
    // Reusable matcher for stripping the enclosing quotes from fields.
    private Matcher quoteMatcher = QuotePattern.matcher("");

    /** Creates an empty sample sheet. */
    public SampleSheet()
    {
        // Create an empty table for consistency.  It will be trashed when
        // a file is loaded.
        table = new ArrayList<Entry>(0);
    }

    /**
     * Creates a sample sheet and loads it from the given reader.
     *
     * @exception IOException     Propagated from reading {@code in}.
     * @exception FormatException Thrown if the contents are not a valid sample sheet.
     * @see #loadTable(Reader)
     */
    public SampleSheet(Reader in) throws IOException, FormatException
    {
        loadTable(in);
    }

    /**
     * Scans the heading line and returns a Map from (normalized) column name to index.
     *
     * Quotes, surrounding whitespace and letter case of the column names are ignored.
     *
     * @exception FormatException Thrown if the line has fewer than two columns, contains an
     *                            unrecognized or repeated column name, or doesn't contain
     *                            the required columns.
     */
    private static EnumMap<Heading, Integer> getColumnIndices(String headingLine) throws FormatException
    {
        // Verify that heading is as expected, ignoring quotes and case
        String[] headingStrings = headingLine.replaceAll("\"", "").split(",");
        if (headingStrings.length <= 1)
            throw new FormatException("Bad sample sheet format.  Expecting a heading such as " + ExpectedHeading);

        EnumMap<Heading, Integer> columns = new EnumMap<Heading, Integer>(Heading.class);
        for (int idx = 0; idx < headingStrings.length; ++idx)
        {
            Heading h;
            try {
                // Locale.ROOT: the default locale must not influence the mapping
                // (e.g. a Turkish locale lower-cases 'I' to a dotless i).
                h = Heading.valueOf(headingStrings[idx].trim().toLowerCase(Locale.ROOT));
            }
            catch (IllegalArgumentException e) {
                throw new FormatException("Unrecognized sample sheet heading '" + headingStrings[idx] + "'");
            }

            // A repeated column would silently shadow the earlier one and make the
            // per-row field count check fail with a misleading message.
            if (columns.put(h, idx) != null)
                throw new FormatException("Duplicate sample sheet heading '" + headingStrings[idx] + "'");
        }

        if (!columns.keySet().containsAll(RequiredColumns))
        {
            EnumSet<Heading> missingColumns = RequiredColumns.clone();
            missingColumns.removeAll(columns.keySet());
            throw new FormatException("sample sheet is missing required columns " + missingColumns.toString());
        }

        return columns;
    }

    /**
     * Replaces the contents of this sample sheet with the rows read from {@code in}.
     *
     * The first line must be the heading line.  When more than one row is loaded the
     * entries are sorted by lane number and checked for indices repeated within a lane.
     * The reader is not closed.
     *
     * @exception IOException     Propagated from reading {@code in}.
     * @exception FormatException Thrown if the input is empty, the heading or a row is
     *                            malformed, or an index appears twice in the same lane.
     */
    public void loadTable(Reader in) throws IOException, FormatException
    {
        table = new ArrayList<Entry>(InitNumEntries);

        LineNumberReader input = new LineNumberReader(in);
        String line = input.readLine(); // First line.  Should be the table header.
        if (line == null)
            throw new FormatException("Empty sample sheet");

        EnumMap<Heading, Integer> columnMap = getColumnIndices(line);

        while ((line = input.readLine()) != null)
            insertRecord(columnMap, line);

        if (table.size() > 1)
            checkForDuplicateIndices();
    }

    /**
     * Sorts the table by lane and verifies that no index appears twice within a lane.
     * Must only be called on a non-empty table.
     */
    private void checkForDuplicateIndices() throws FormatException
    {
        // Sorting by lane groups together the entries that must be compared.
        Collections.sort(table, new Comparator<Entry>() {
            @Override
            public int compare(Entry a, Entry b) {
                int laneA = a.getLane();
                int laneB = b.getLane();
                return laneA < laneB ? -1 : (laneA == laneB ? 0 : 1);
            }
        });

        Set<String> indicesInLane = new HashSet<String>();
        int currentLane = table.get(0).getLane();
        for (Entry e: table)
        {
            if (e.getLane() != currentLane)
            {
                // lane change
                indicesInLane.clear();
                currentLane = e.getLane();
            }

            if (!indicesInLane.add(e.getIndex()))
                throw new FormatException("index " + e.getIndex() + " appears twice for the same lane " + currentLane);
        }
    }

    /**
     * Parses one data row and appends the resulting Entry to the table.
     *
     * @param columns Map from column name to field position, as produced from the heading line.
     * @param line    The raw text of the row.
     * @exception FormatException Thrown if the number of fields differs from the number of
     *                            columns or a field value is rejected by the Entry.
     */
    private void insertRecord(final EnumMap<Heading, Integer> columns, String line) throws FormatException
    {
        final int expected = columns.size();

        // A negative limit keeps trailing empty fields, so that rows whose last columns
        // are empty still have the right number of fields.
        String[] rawFields = line.split(",", -1);

        // Tolerate surplus trailing separators (empty fields beyond the last column).
        int numFields = rawFields.length;
        while (numFields > expected && rawFields[numFields - 1].isEmpty())
            --numFields;

        if (numFields != expected)
        {
            throw new FormatException("Number of fields in sample sheet row different from heading.  Expecing " +
                expected + " fields but found " + numFields + ". Line: " + line);
        }

        // Format is CSV with at least the columns specified by RequiredColumns.
        // E.g., "FCID","Lane","SampleID","SampleRef","Index","Description","Control","Recipe","Operator"
        // All text fields are quoted (all except Lane)

        // remove external quotes and whitespace from all string fields (both the
        // whitespace around the quotes and the whitespace just inside them)
        String[] fields = new String[expected];
        for (int i = 0; i < expected; ++i)
        {
            quoteMatcher.reset(rawFields[i].trim());
            fields[i] = quoteMatcher.replaceAll("").trim();
        }

        Entry entry;
        try {
            entry = Entry.createEntry(columns, fields);
        }
        catch (IllegalArgumentException e) {
            throw new FormatException(e.getMessage() + ". Line: " + line);
        }

        table.add(entry);
    }

    /**
     * Returns the distinct sample ids of the entries in the given lane.
     *
     * @param lane Lane number; must be greater than 0.
     * @return A new set, empty if no entry belongs to the lane.
     * @exception IllegalArgumentException Thrown if {@code lane} is not greater than 0.
     */
    public Set<String> getSamplesInLane(int lane)
    {
        if (lane <= 0)
            throw new IllegalArgumentException("Invalid negative lane number " + lane);

        HashSet<String> samples = new HashSet<String>(table.size() / 8);
        for (Entry e: this)
        {
            if (lane == e.getLane())
                samples.add(e.getSampleId());
        }
        return samples;
    }

    /** Returns a new set with the distinct sample ids of all the entries. */
    public Set<String> getSamples()
    {
        HashSet<String> uniqueSamples = new HashSet<String>(table.size());
        for (Entry e: table)
            uniqueSamples.add(e.getSampleId());
        return uniqueSamples;
    }

    /** Returns the number of entries in the sample sheet. */
    public int size() { return table.size(); }

    /** Returns true if the sample sheet has no entries. */
    public boolean isEmpty() { return table.isEmpty(); }

    /** Returns a read-only iterator over the entries. */
    @Override
    public Iterator<Entry> iterator() { return new SIterator(table); }

    /**
     * Decorates our Entry table's iterator to disable the remove() method.
     */
    protected static class SIterator implements Iterator<Entry> {
        private final Iterator<Entry> base;

        public SIterator(List<Entry> table) {
            base = table.iterator();
        }

        @Override
        public boolean hasNext() { return base.hasNext(); }

        @Override
        public Entry next() { return base.next(); }

        /** Always throws: entries cannot be removed through this iterator. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * One row of the sample sheet.  String properties whose column is absent from the
     * sheet are null.
     */
    public static class Entry
    {
        private String flowcellId;
        private int lane;
        private String sampleId;
        private String sampleRef;
        private String index;
        private String description;
        private String control;
        private String recipe;
        private String operator;
        private String project;

        public String getFlowcellId() { return flowcellId; }
        public int getLane() { return lane; }
        public String getSampleId() { return sampleId; }
        public String getSampleRef() { return sampleRef; }
        public String getIndex() { return index; }
        public String getDescription() { return description; }
        public String getControl() { return control; }
        public String getRecipe() { return recipe; }
        public String getOperator() { return operator; }
        public String getProject() { return project; }

        /** Returns the properties joined by commas, in declaration order. */
        @Override
        public String toString() {
            StringBuilder builder = new StringBuilder(150);
            builder
                .append(flowcellId)  .append(",")
                .append(lane)        .append(",")
                .append(sampleId)    .append(",")
                .append(sampleRef)   .append(",")
                .append(index)       .append(",")
                .append(description) .append(",")
                .append(control)     .append(",")
                .append(recipe)      .append(",")
                .append(operator)    .append(",")
                .append(project);
            return builder.toString();
        }

        /**
         * Builds an Entry from the (already cleaned) fields of one row.
         *
         * @param columnIndices Map from column name to position within {@code fields}.
         * @param fields        The field values of the row.
         * @exception IllegalArgumentException Thrown if the lane column is missing, the lane
         *            is not a number greater than 0 (a NumberFormatException when it cannot
         *            be parsed), or the index has an unexpected length.
         */
        public static Entry createEntry(final EnumMap<Heading, Integer> columnIndices, String[] fields)
        {
            Entry e = new Entry();

            e.setFlowcellId ( fieldOrNull(columnIndices, fields, Heading.fcid) );
            e.setSampleId   ( fieldOrNull(columnIndices, fields, Heading.sampleid) );
            e.setSampleRef  ( fieldOrNull(columnIndices, fields, Heading.sampleref) );
            e.setIndex      ( fieldOrNull(columnIndices, fields, Heading.index) );
            e.setDescription( fieldOrNull(columnIndices, fields, Heading.description) );
            e.setControl    ( fieldOrNull(columnIndices, fields, Heading.control) );
            e.setRecipe     ( fieldOrNull(columnIndices, fields, Heading.recipe) );
            e.setOperator   ( fieldOrNull(columnIndices, fields, Heading.operator) );

            // The lane is the only numeric property and has no meaningful "absent" value.
            String laneText = fieldOrNull(columnIndices, fields, Heading.lane);
            if (laneText == null)
                throw new IllegalArgumentException("Missing lane column");

            int laneNumber;
            try {
                laneNumber = Integer.parseInt(laneText);
            }
            catch (NumberFormatException ex) {
                throw new NumberFormatException("Invalid lane number '" + laneText + "'. Expecting a number > 0");
            }
            e.setLane(laneNumber);

            e.setProject( fieldOrNull(columnIndices, fields, Heading.sampleproject) );

            return e;
        }

        /** Returns the field of the given column, or null if the column is not in the map. */
        private static String fieldOrNull(EnumMap<Heading, Integer> columnIndices, String[] fields, Heading column)
        {
            Integer idx = columnIndices.get(column);
            return idx == null ? null : fields[idx];
        }

        protected void setFlowcellId (String v) { flowcellId = v; }
        protected void setSampleId   (String v) { sampleId = v; }
        protected void setSampleRef  (String v) { sampleRef = v; }
        protected void setDescription(String v) { description = v; }
        protected void setControl    (String v) { control = v; }
        protected void setRecipe     (String v) { recipe = v; }
        protected void setOperator   (String v) { operator = v; }
        protected void setProject    (String v) { project = v; }

        /**
         * Sets the lane number.
         *
         * @exception IllegalArgumentException Thrown if {@code v} is not greater than 0.
         */
        protected void setLane(int v) {
            if (v <= 0)
                throw new IllegalArgumentException("Invalid lane number: " + v + ".  Expecting a number > 0");
            lane = v;
        }

        /**
         * Sets the index (bar code) sequence, converting it to upper case.
         * Null and the empty string are accepted as they are.
         *
         * @exception IllegalArgumentException Thrown if a non-empty {@code v} is shorter than
         *            BAR_CODE_MIN_LENGTH or longer than BAR_CODE_MAX_LENGTH.
         */
        protected void setIndex(String v)
        {
            if (v == null)
            {
                index = null;
                return;
            }

            if ( !v.isEmpty() && ( v.length() < BAR_CODE_MIN_LENGTH || v.length() > BAR_CODE_MAX_LENGTH ) )
                throw new IllegalArgumentException("Unexpected length for bar code sequence '" + v + "' (length " + v.length() + ", expected in [" + BAR_CODE_MIN_LENGTH + "," + BAR_CODE_MAX_LENGTH + "])");
            index = v.toUpperCase(Locale.ROOT);
        }
    }
}