// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal. If not, see <http://www.gnu.org/licenses/>.
package it.crs4.seal.recab;
import it.crs4.seal.common.FormatException;
import it.crs4.seal.common.CutString;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * A {@link VariantTable} that keeps, for every contig, the variant positions
 * in a sorted array of primitive ints.
 *
 * The table is empty until {@link #load(VariantReader)} has been called.
 */
public class ArrayVariantTable implements VariantTable
{
    private static final Log LOG = LogFactory.getLog(ArrayVariantTable.class);

    /** Number of slots initially allocated for each contig's position array. */
    private static final int InitialCapacityPerChr = 400000;

    /**
     * Growable array of distinct ints kept in ascending order.
     */
    protected static class IntSortedArray
    {
        // Number of unused slots (12800 ints, i.e. 50 KB) above which compact()
        // reallocates the backing array.
        private static final int CompactionThreshold = 50*1024 / 4;

        /** Backing storage; only the first <code>size</code> slots are meaningful. */
        protected int[] data;
        /** Number of elements currently stored. */
        protected int size;

        /** Creates an empty array with room for 1000 elements. */
        public IntSortedArray()
        {
            this(1000);
        }

        /**
         * Creates an empty array.
         * @param initialStorage number of slots to allocate up front (may be 0).
         */
        public IntSortedArray(int initialStorage)
        {
            data = new int[initialStorage];
            size = 0;
        }

        /**
         * Add the position to the list of variant positions.
         * This method is optimized to receive positions in increasing order:
         * the insertion point is searched linearly starting from the end.
         * A value that is already present is not inserted again.
         *
         * @param value the value to insert.
         * @return this array, to allow chained calls.
         */
        public IntSortedArray add(int value)
        {
            // find the insert position, starting at the last element
            int ipos = size - 1;
            while (ipos >= 0 && data[ipos] > value)
                --ipos;

            // either ipos < 0 or data[ipos] <= value
            if (ipos < 0 || data[ipos] != value)
            {
                // Value not already in array so we have to make an insertion
                growIfNecessary(); // ensure room for 1 new element
                if (ipos < size - 1)
                {
                    // insert before the last element: shift the tail one slot to the right
                    System.arraycopy(data, ipos + 1, data, ipos + 2, size - ipos - 1);
                }
                data[ipos+1] = value;
                size += 1;
            }
            return this;
        }

        /**
         * Ensures the backing array has room for at least one more element,
         * doubling its capacity when it is full.
         */
        protected void growIfNecessary()
        {
            if (size >= data.length)
            {
                // Doubling a zero-length array yields zero again, so always ask
                // for at least one extra slot.  The computation is done in long
                // so that doubling a very large array cannot wrap to a negative size.
                final long wanted = Math.max(2L * data.length, (long)size + 1);
                final int newCapacity = (int)Math.min(wanted, (long)Integer.MAX_VALUE);
                data = Arrays.copyOf(data, newCapacity);
            }
        }

        /** @return the number of elements stored. */
        public int size() { return size; }

        /**
         * @param element the value to look for.
         * @return true if the value is stored in this array.
         */
        public boolean contains(int element)
        {
            // only the first `size` slots hold valid, sorted data
            return Arrays.binarySearch(data, 0, size, element) >= 0;
        }

        /**
         * Trims the backing array to exactly <code>size</code> slots, but only
         * when the number of unused slots exceeds the compaction threshold.
         */
        public void compact()
        {
            if (data.length - size > CompactionThreshold)
                data = Arrays.copyOf(data, size);
        }
    }

    /**
     * Main data structure.
     * We use a Map with one entry per contig/chromosome.
     * It is null until load() is called.
     */
    // XXX: we save some memory by using Integer as opposed to Long.  We'll be
    // fine with the human genome, but large genomes would be a problem.
    //
    // TODO: Can we be more clever in the way we store these things to save some memory?
    protected Map< String, IntSortedArray > data;

    /**
     * Tells whether the given position was loaded as a variant location.
     *
     * @param chr contig/chromosome name.
     * @param pos position on the contig.
     * @return true if the position is in the table; false otherwise,
     *         including when nothing has been loaded yet.
     * @throws RuntimeException if pos is greater than Integer.MAX_VALUE.
     */
    public boolean isVariantLocation(String chr, long pos)
    {
        if (pos > Integer.MAX_VALUE)
            throw new RuntimeException("pos bigger than expected! File a bug!!");

        // A value below the int range can never have been stored; without this
        // check the narrowing cast below would wrap it onto an unrelated position.
        if (pos < Integer.MIN_VALUE)
            return false;

        // Nothing loaded yet: behave like an empty table, as size() does.
        if (data == null)
            return false;

        final IntSortedArray list = data.get(chr);
        return list != null && list.contains((int)pos);
    }

    /**
     * Replaces the contents of this table with the regions provided by the reader.
     * Entries are read until reader.nextEntry() returns false.  For each entry,
     * every position in [position, position + length) is added to the entry's contig.
     *
     * @param reader source of variant regions.
     * @throws RuntimeException if a region extends beyond Integer.MAX_VALUE.
     */
    public void load(VariantReader reader) throws IOException, FormatException
    {
        data = new HashMap< String, IntSortedArray >(30); // initial capacity ok for the human genome plus a few extra contigs
        VariantRegion snp = new VariantRegion();
        long count = 0;

        while (reader.nextEntry(snp)) // snp is re-used
        {
            // col 1
            final String chr = snp.getContigName();
            IntSortedArray list = data.get(chr);
            if (list == null)
            {
                list = new IntSortedArray(InitialCapacityPerChr);
                data.put(chr, list);
            }

            // Compute the end in long: in int arithmetic an overflow would wrap
            // to a negative value and the region would be silently skipped.
            final int start = snp.getPosition();
            final long end = (long)start + snp.getLength();
            if (end - 1 > Integer.MAX_VALUE)
                throw new RuntimeException("variant region on " + chr + " starting at " + start +
                    " extends beyond the maximum supported position " + Integer.MAX_VALUE);

            // long loop variable so that a region ending at Integer.MAX_VALUE terminates
            for (long pos = start; pos < end; ++pos)
                list.add((int)pos);

            count += 1;
            if (count % 1000000 == 0 && LOG.isInfoEnabled())
                LOG.info("Loaded " + count);
        }

        // release the unused tail of each per-contig array
        for (IntSortedArray array: data.values())
            array.compact();

        LOG.info("Loaded a total of " + count + " known variations");
    }

    /**
     * @return the total number of variant positions stored over all contigs;
     *         0 if nothing has been loaded.
     */
    public int size()
    {
        int sum = 0;
        if (data != null)
        {
            for (IntSortedArray s: data.values())
                sum += s.size();
        }
        return sum;
    }

    /**
     * @return a read-only view of the names of the contigs in the table;
     *         empty if nothing has been loaded.
     */
    public Set<String> getContigs()
    {
        if (data == null)
            return Collections.<String>emptySet();
        // read-only so that callers cannot drop a contig from the table by accident
        return Collections.unmodifiableSet(data.keySet());
    }
}