// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.recab;
import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.CutString;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * A {@link VariantTable} that keeps, for each contig, a sorted list of the
 * reference positions covered by known variants.
 *
 * The table is populated by {@link #load(VariantReader)}; lookups are done
 * with a binary search on the contig's position list.  Before the first call
 * to <code>load</code> the table behaves as an empty table.
 */
public class ArrayListVariantTable implements VariantTable
{
    private static final Log LOG = LogFactory.getLog(ArrayListVariantTable.class);

    /** Initial capacity of each per-contig position list. */
    private static final int InitialCapacityPerChr = 400000;

    /**
     * Main data structure.
     * We use a Map with one entry per contig/chromosome.  Each list is kept
     * sorted in ascending order and free of duplicates, which is what makes
     * the binary search in {@link #isVariantLocation(String, long)} valid.
     * It is null until {@link #load(VariantReader)} is called.
     */
    // XXX: save some memory with Integer as opposed to Long.  We'll be fine with
    // the human genome, but large genomes would be a problem.
    //
    // TODO: Can we be more clever in the way we store these things to save some memory?
    protected Map< String, ArrayList<Integer> > data;

    /**
     * Tell whether a reference position is covered by a loaded variant.
     *
     * @param chr contig/chromosome name, as provided by the reader at load time.
     * @param pos reference position to look up.
     * @return true if <code>pos</code> was loaded as a variant position of
     *         <code>chr</code>; false otherwise, including when the contig is
     *         unknown or nothing has been loaded yet.
     * @throws RuntimeException if <code>pos</code> is larger than Integer.MAX_VALUE,
     *         since positions are stored as ints.
     */
    public boolean isVariantLocation(String chr, long pos)
    {
        if (pos > Integer.MAX_VALUE)
            throw new RuntimeException("pos bigger than expected! File a bug!!");

        // Nothing stored can be smaller than Integer.MIN_VALUE; bail out here
        // because the cast to int below would silently wrap around.
        if (pos < Integer.MIN_VALUE)
            return false;

        // Be consistent with size(): an unloaded table is an empty table.
        if (data == null)
            return false;

        ArrayList<Integer> list = data.get(chr);
        if (list == null)
            return false;

        return Collections.binarySearch(list, (int)pos) >= 0;
    }

    /**
     * Replace the contents of this table with the variants provided by the reader.
     *
     * Every position in [position, position + length) of each entry is recorded
     * as a variant location for the entry's contig.  Overlapping or repeated
     * regions are merged, so each position is stored at most once.
     *
     * @param reader source of variant regions; read until nextEntry returns false.
     * @throws IOException propagated from the reader.
     * @throws FormatException propagated from the reader.
     */
    public void load(VariantReader reader) throws IOException, FormatException
    {
        data = new HashMap< String, ArrayList<Integer> >(30); // initial capacity ok for human genome plus a few extra contigs

        VariantRegion snp = new VariantRegion();
        long count = 0;

        while (reader.nextEntry(snp)) // snp is re-used
        {
            // col 1
            String chr = snp.getContigName();
            ArrayList<Integer> list = data.get(chr);
            if (list == null)
            {
                list = new ArrayList<Integer>(InitialCapacityPerChr);
                data.put(chr, list);
            }

            int refpos = snp.getPosition();
            int end = refpos + snp.getLength();

            // reference positions [refpos,end) are to be inserted as variants
            insertRegion(list, refpos, end);

            count += 1;
            if (LOG.isInfoEnabled())
            {
                if (count % 1000000 == 0)
                    LOG.info("Loaded " + count);
            }
        }
        LOG.info("Loaded a total of " + count + " known variations");
    }

    /**
     * Merge the positions [start,end) into a sorted, duplicate-free list,
     * keeping it sorted and duplicate-free.
     */
    private static void insertRegion(List<Integer> list, int start, int end)
    {
        // Find the index of the first element >= start.  We scan backwards
        // from the tail, so input that arrives in ascending order costs
        // (almost) nothing here.
        int idx = list.size();
        while (idx > 0 && list.get(idx - 1) >= start)
            --idx;

        // Invariant: every element before idx is < pos and, if idx is within
        // the list, the element at idx is >= pos.
        for (int pos = start; pos < end; ++pos, ++idx)
        {
            if (idx >= list.size())
            {
                // We've run past the last stored position: simply append.
                // This also covers a region that overlaps the tail of the
                // list and extends beyond it.
                list.add(pos);
            }
            else if (list.get(idx) != pos)
            {
                // pos is not in the list yet; insert it in order.
                list.add(idx, pos);
            }
            // else: pos is already present; nothing to do.
        }
    }

    /**
     * @return the total number of variant positions stored, summed over all
     *         contigs; 0 if nothing has been loaded.
     */
    public int size()
    {
        int sum = 0;
        if (data != null)
        {
            for (List<Integer> s: data.values())
                sum += s.size();
        }
        return sum;
    }

    /**
     * @return the names of the contigs for which at least one variant entry
     *         was loaded; an empty set if nothing has been loaded.  When
     *         data is loaded this is a view of the table's key set.
     */
    public Set<String> getContigs()
    {
        if (data == null)
            return Collections.<String>emptySet();
        return data.keySet();
    }
}