// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.recab;
import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.CutString;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.HashMap;
import java.util.HashSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * VariantTable implementation that keeps every known variant position in
 * memory, using one hash set of positions per contig/chromosome.
 *
 * The table is empty until {@link #load(VariantReader)} has been called.
 * Positions are stored as int to save memory, so coordinates that do not
 * fit in an int cannot be represented.
 */
public class HashSetVariantTable implements VariantTable
{
	private static final Log LOG = LogFactory.getLog(HashSetVariantTable.class);

	/** Initial capacity of each per-contig position set. */
	private static final int INITIAL_CAPACITY_PER_CHR = 400000;

	/** Load factor of each per-contig position set. */
	private static final float LOAD_FACTOR = 0.80f;

	/**
	 * Main data structure.
	 * We use a Map with one entry per contig/chromosome.
	 * For each contig, we have a Set which stores all its SNP positions.
	 * It is null until load() is called.
	 */
	// XXX: save some memory with Integer as opposed to Long.  We'll be fine with
	// the human genome, but large genomes would be a problem.
	//
	// TODO: Can we be more clever in the way we store these things to save some memory?
	protected Map< String, Set<Integer> > data;

	/**
	 * Tells whether the given position of the given contig was loaded as a
	 * variant location.
	 *
	 * @param chr contig/chromosome name, as provided by the reader at load time.
	 * @param pos position to look up.
	 * @return true if pos was stored for chr; false otherwise, including when
	 *         nothing has been loaded yet or the contig is unknown.
	 * @throws RuntimeException if pos is greater than Integer.MAX_VALUE.
	 */
	public boolean isVariantLocation(String chr, long pos)
	{
		if (pos > Integer.MAX_VALUE)
			throw new RuntimeException("pos bigger than expected! File a bug!!");

		// A value below the int range can never have been stored.  Without this
		// check the narrowing cast below would wrap it onto an unrelated
		// in-range position and could report a false match.
		if (data == null || pos < Integer.MIN_VALUE)
			return false;

		Set<Integer> positions = data.get(chr);
		return positions != null && positions.contains((int)pos);
	}

	/**
	 * Replaces the contents of this table with the entries provided by reader.
	 * Every position covered by an entry, i.e. the range
	 * [position, position + length), is stored.
	 *
	 * @param reader source of variant entries; read until nextEntry returns false.
	 * @throws IOException propagated from the reader.
	 * @throws FormatException propagated from the reader.
	 * @throws RuntimeException if an entry extends past Integer.MAX_VALUE.
	 */
	public void load(VariantReader reader) throws IOException, FormatException
	{
		data = new HashMap< String, Set<Integer> >(30); // initial capacity ok for the human genome plus a few extra contigs
		VariantRegion snp = new VariantRegion();
		long count = 0;

		while (reader.nextEntry(snp)) // snp is re-used
		{
			String chr = snp.getContigName();
			Set<Integer> positions = data.get(chr);
			if (positions == null)
			{
				positions = new HashSet<Integer>(INITIAL_CAPACITY_PER_CHR, LOAD_FACTOR);
				data.put(chr, positions);
			}

			final int start = snp.getPosition();
			// Compute the exclusive end as a long: in int arithmetic an overflowing
			// sum would wrap to a negative value and the region would be silently
			// dropped.
			final long end = (long)start + snp.getLength();
			if (end - 1 > Integer.MAX_VALUE)
				throw new RuntimeException("variant region " + chr + ":" + start + " (length " + snp.getLength() + ") extends past the largest supported position");

			for (long pos = start; pos < end; ++pos)
				positions.add((int)pos);

			count += 1;
			if (LOG.isInfoEnabled() && count % 1000000 == 0)
				LOG.info("Loaded " + count);
		}
		LOG.info("Loaded a total of " + count + " known variations");
	}

	/**
	 * Number of positions stored, summed over all contigs.
	 *
	 * @return the total number of stored positions, 0 if nothing has been
	 *         loaded, or Integer.MAX_VALUE if the total does not fit in an int.
	 */
	public int size()
	{
		if (data == null)
			return 0;

		// Accumulate in a long so that a very large table saturates instead of
		// wrapping around to a negative size.
		long total = 0;
		for (Set<Integer> positions: data.values())
			total += positions.size();

		return total > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int)total;
	}

	/**
	 * Names of the contigs for which at least one entry was read.
	 *
	 * @return the key set of the underlying map (a live view, not a copy), or
	 *         an empty set if nothing has been loaded yet.
	 */
	public Set<String> getContigs()
	{
		if (data == null)
			return new HashSet<String>();
		return data.keySet();
	}
}