// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.demux;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Collection;
import java.util.Collections;
public class BarcodeLookup
{
// defaults
protected static final int InitNumLanes = 8;
protected static final int InitNumIndexTags = 12;
protected static final int MaxReasonableMismatches = 10;
// protected fields
/*
* The table an ArrayList, where value at i corresponds to lane i+1.
* Each position contains a HashMap that maps DNA tags to Match objects, for that lane.
*/
protected ArrayList< HashMap<String, Match> > table;
/*
* Some lanes only contain one sample. For these lanes, getSampleId() will
* always return the only Match object available. Since getting it out of
* the HashMap in the table is a little contrived, we cache those in the
* following array and return them when needed. For lanes that are empty or
* contain more than one sample the loneSampleCache contains null.
*/
protected ArrayList< Match > loneSampleCache;
protected int nSamples = 0;
protected SubstitutionGenerator gen = new SubstitutionGenerator();
public BarcodeLookup()
{
// Create an empty table for consistency. It will be trashed when
// a samplesheet is loaded.
table = new ArrayList< HashMap<String, Match> >(0);
}
public BarcodeLookup(SampleSheet sheet, int maxMismatches)
{
load(sheet, maxMismatches);
}
public void load(SampleSheet sheet, int maxMismatches)
{
if (maxMismatches < 0)
throw new IllegalArgumentException("maximum number of acceptable mismatches must not be negative (got " + maxMismatches + ")");
if (maxMismatches > MaxReasonableMismatches)
{
throw new IllegalArgumentException("maximum number of acceptable mismatches is too high (got " + maxMismatches +
", limit set to " + MaxReasonableMismatches +
"). If you need to change the limit modify the value of MaxReasonableMismatches in the code and recompile.");
}
// Determine which lanes contain more than one sample.
ArrayList<Integer> laneSampleCount = getLaneSampleCount(sheet);
// now construct the barcode table
table = new ArrayList< HashMap<String, Match> >(0);
nSamples = 0;
for (SampleSheet.Entry e: sheet)
{
// for lanes that only have one sample, only insert the exact match
if (laneSampleCount.get(e.getLane() - 1) <= 1)
insertRecord(e, 0);
else
insertRecord(e, maxMismatches);
}
// cache Match objects for lanes with only one sample.
// Instantiate the cache array and fill with nulls.
loneSampleCache = new ArrayList<Match>(laneSampleCount.size());
loneSampleCache.addAll(Collections.nCopies(laneSampleCount.size(), (Match)null));
for (int index = 0; index < table.size(); ++index)
{
if (table.get(index).size() == 1)
loneSampleCache.set(index, table.get(index).values().iterator().next());
}
}
/**
* Get the sample id corresponding to the given lane and index sequence.
*
* If this BarcodeLookup was loaded specifying a mismatch limit, this method
* will apply that setting and return a sample id that is within the given
* number of mismatches.
*
* In addition,
* <b>if in the loaded sample sheet a sample appears alone in a lane, it will
* be returned for any query on that lane (regardless of index sequence).
* For such queries the returned Match object will specify 0 mismatches</b>.
*/
public Match getSampleId(int lane, String indexSeq)
{
if (lane <= 0)
throw new IllegalArgumentException("Invalid negative lane number " + lane);
if (indexSeq == null)
indexSeq = ""; // turn null into an empty string
if ( !indexSeq.isEmpty() && ( indexSeq.length() < SampleSheet.BAR_CODE_MIN_LENGTH || indexSeq.length() > SampleSheet.BAR_CODE_MAX_LENGTH ) )
{
throw new IllegalArgumentException("Unexpected length of index sequence '" + indexSeq +
"' (expected in [" + SampleSheet.BAR_CODE_MIN_LENGTH + "," + SampleSheet.BAR_CODE_MAX_LENGTH + "]) or an empty string)");
}
// turn tag to uppercase
indexSeq = indexSeq.toUpperCase();
int index = lane - 1;
if (index < table.size())
{
Map<String, Match> map = table.get(index);
if (map.size() == 1)
return loneSampleCache.get(index);
else
return map.get(indexSeq); // will return null if the indexSeq isn't in the Map
}
else
return null;
}
/**
* Count the number of samples in each lane.
*
* @return An array where element i corresponds to lane i+1.
*/
protected static ArrayList<Integer> getLaneSampleCount(SampleSheet sheet)
{
if (sheet.isEmpty())
return new ArrayList<Integer>(0);
// Since we can't know the number of lanes without scanning the
// sample sheet, we allocate the upper bound--i.e., the number
// of entries in the sample sheet.
ArrayList<Integer> laneSampleCount = new ArrayList<Integer>(sheet.size());
for (SampleSheet.Entry e: sheet)
{
int index = e.getLane() - 1;
if (index >= laneSampleCount.size())
laneSampleCount.addAll(Collections.nCopies(index - laneSampleCount.size() + 1, 0));
laneSampleCount.set(index, laneSampleCount.get(index) + 1);
}
return laneSampleCount;
}
public int getNumSamples() { return nSamples; }
public boolean isEmpty() { return nSamples == 0; }
public static class Match
{
private SampleSheet.Entry entry;
private int mismatches;
public Match(SampleSheet.Entry e, int mismatches)
{
entry = e;
this.mismatches = mismatches;
}
public SampleSheet.Entry getEntry() { return entry; }
public int getMismatches() { return mismatches; }
public String toString() {
if (entry == null)
return "(NULL,)";
else
return "(" + entry.getSampleId() + "," + mismatches + ")";
}
}
protected void insertRecord(SampleSheet.Entry entry, int maxMismatches)
{
int lane = entry.getLane();
// grow the array if necessary for this entry's lane
int index = lane - 1;
if (table.size() < lane)
{
int grow_by = lane - table.size();
for (int i = 0; i < grow_by; ++i)
table.add(new HashMap<String, Match>(InitNumIndexTags));
}
// insert the entry provided into the lookup table, and all its
// acceptable mismatches.
gen.insertSubstitutions(entry, maxMismatches, table.get(index));
nSamples += 1;
}
/**
* Implements a DFS that generates tags with substitution errors.
*/
protected static class SubstitutionGenerator
{
private static final char[] Alphabet = { 'A', 'C', 'G', 'T', 'N' };
/* Search state variables, set by genSubstitutions(), used as "globals" in generate()
* and then reset to null before genSubstitutions exits.
*/
/* Entry on which we're working */
private SampleSheet.Entry entry;
/* original Tag from Entry, cached as a character array */
private char[] originalTag;
/* current tag with substitutions (state of the DFS) */
private char[] alteredTag;
/* */
private Map<String, Match> results;
/* match objects are cached and reused, so all altered tags for the same entry with the
* same number of mismatches use the same Match object.
*/
private List<Match> matches;
public void insertSubstitutions(SampleSheet.Entry e, int maxSubstitutions, Map<String, Match> resultMap)
{
String tag = e.getIndex();
if (maxSubstitutions < 0)
throw new IllegalArgumentException("maxSubstitutions must be greater than or equal to zero (got " + maxSubstitutions + ")");
// can't perform more substitutions than the length of the tag
maxSubstitutions = Math.min(maxSubstitutions, tag.length());
try {
matches = new ArrayList<Match>(5);
entry = e;
alteredTag = tag.toCharArray();
originalTag = tag.toCharArray();
results = resultMap;
insertMatch(tag, 0); // insert original tag
// then generate substitutions
if (maxSubstitutions > 0)
generate(0, 0, maxSubstitutions);
}
finally {
matches = null;
entry = null;
alteredTag = null;
originalTag = null;
results = null;
}
}
private void insertMatch(String tag, int numMismatches)
{
// check for duplicates
Match previous = results.get(tag);
if (previous != null)
{
throw new RuntimeException(
"Mismatch limit is too high. " + numMismatches + " mismatches with barcode " +
entry.getIndex() + " can result in barcode " + tag +
", which conflicts with barcode " + previous.getEntry().getIndex() + " with " +
previous.getMismatches() + " mismatches");
}
if (numMismatches >= matches.size())
{
// If necessary, create a new Match object for this Entry with this number of mismatches.
// The Match object will be cached in the matches List.
for (int m = matches.size(); m <= numMismatches; ++m)
matches.add(new Match(entry, m));
}
results.put(tag, matches.get(numMismatches));
}
/**
* Recursive method to generate tags with substitution errors.
* The method implements a depth-first traversal of a tree, rooted at the original
* tag, where each branch is a different substitution applied to the root of the branch.
* Therefore, the level is equal to the number of substitutions applied to the tag.
*
* The state of the traversal is kept in this object's instance variables.
*
* @param startPos Position within alteredTag at which to start inserting errors.
* @param subsDone Number of substitutions already inserted, to be compared to maxSubstitutions.
* @param maxSubstitutions Limit to subsDone. This private function takes for granted that
* the caller has verified that maxSubstitutions is less than or equal to originalTag.length.
*
* @return Tags with substitutions are appended to this.resultList
*/
private void generate(int startPos, int subsDone, int maxSubstitutions)
{
subsDone += 1; // with this call to generate, we'll have done subsDone+1 substitutions to the tag
// generate another substitution. Start from startPos and onwards
for (int pos = startPos; pos < originalTag.length; ++pos)
{
for (int sub = 0; sub < Alphabet.length; ++sub)
{
if (originalTag[pos] != Alphabet[sub]) // don't substitute with itself
{
alteredTag[pos] = Alphabet[sub];
insertMatch(new String(alteredTag), subsDone);
if (subsDone < maxSubstitutions)
generate(pos+1, subsDone, maxSubstitutions);
// restore original value at this position as we back out of this branch of the substitution tree
alteredTag[pos] = originalTag[pos];
}
}
}
}
}
}