/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.variantcontext;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
/**
 * Represents an ordered collection of {@link Genotype} objects.
 *
 * <p>Besides the plain {@link List} view, a context maintains three lazily built caches:
 * a sample-name to offset lookup table, an alphabetically sorted list of sample names, and
 * the maximum ploidy observed among its genotypes. Every mutator either updates or
 * invalidates these caches so that they never go stale.</p>
 *
 * <p>Positional insertion ({@link #add(int, Genotype)}, {@link #addAll(int, Collection)})
 * and list iterators are not supported and throw {@link UnsupportedOperationException}.</p>
 */
public class GenotypesContext implements List<Genotype> {
    /**
     * Static constant value for an empty, immutable GenotypesContext. Useful since so many
     * VariantContexts have no genotypes.
     */
    public final static GenotypesContext NO_GENOTYPES =
            new GenotypesContext(new ArrayList<Genotype>(0), new HashMap<String, Integer>(0), Collections.<String>emptyList()).immutable();

    /**
     * A list of sample names, one for each genotype in genotypes, sorted in alphabetical order.
     * {@code null} means the cache has to be rebuilt by {@link #ensureSampleOrdering()}.
     */
    List<String> sampleNamesInOrder = null;

    /**
     * A map optimized for efficient lookup. Each genotype in genotypes must have its
     * sample name in sampleNameToOffset, with a corresponding integer value that indicates the
     * offset of that genotype in the vector of genotypes. {@code null} means the cache has to
     * be rebuilt by {@link #ensureSampleNameMap()}.
     */
    Map<String, Integer> sampleNameToOffset = null;

    /**
     * An ArrayList of genotypes contained in this context
     *
     * WARNING: TO ENABLE THE LAZY VERSION OF THIS CLASS, NO METHODS SHOULD DIRECTLY
     * ACCESS THIS VARIABLE. USE getGenotypes() INSTEAD.
     */
    ArrayList<Genotype> notToBeDirectlyAccessedGenotypes;

    /**
     * Cached value of the maximum ploidy observed among all samples, or -1 when it has not
     * been computed since the last mutation. The caller supplied default ploidy is never
     * stored here; a cached 0 means "no genotype with a ploidy above 0".
     */
    private int maxPloidy = -1;

    /** Are we forbidding users to modify the list? */
    boolean immutable = false;

    // ---------------------------------------------------------------------------
    //
    // private constructors -- you have to use static create methods to make these classes
    //
    // ---------------------------------------------------------------------------

    /**
     * Create an empty GenotypeContext
     */
    protected GenotypesContext() {
        this(10);
    }

    /**
     * Create an empty GenotypeContext, with initial capacity for n elements
     *
     * @param n the initial capacity of the backing list
     */
    protected GenotypesContext(final int n) {
        this(new ArrayList<Genotype>(n));
    }

    /**
     * Create a GenotypeContext containing genotypes. The list is used directly (not copied)
     * and the sample lookup caches are left unbuilt.
     *
     * @param genotypes the backing list of genotypes
     */
    protected GenotypesContext(final ArrayList<Genotype> genotypes) {
        this.notToBeDirectlyAccessedGenotypes = genotypes;
        this.sampleNameToOffset = null;
    }

    /**
     * Create a fully resolved GenotypeContext containing genotypes, sample lookup table,
     * and sorted sample names. All three arguments are stored as given (not copied).
     *
     * @param genotypes our genotypes in arbitrary order
     * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its
     * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that
     * genotype in the vector of genotypes
     * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical
     * order.
     */
    protected GenotypesContext(final ArrayList<Genotype> genotypes,
                               final Map<String, Integer> sampleNameToOffset,
                               final List<String> sampleNamesInOrder) {
        this.notToBeDirectlyAccessedGenotypes = genotypes;
        this.sampleNameToOffset = sampleNameToOffset;
        this.sampleNamesInOrder = sampleNamesInOrder;
    }

    // ---------------------------------------------------------------------------
    //
    // public static factory methods
    //
    // ---------------------------------------------------------------------------

    /**
     * Basic creation routine
     * @return an empty, mutable GenotypeContext
     */
    public static final GenotypesContext create() {
        return new GenotypesContext();
    }

    /**
     * Basic creation routine
     * @param nGenotypes the initial capacity of the new context
     * @return an empty, mutable GenotypeContext with initial capacity for nGenotypes
     */
    public static final GenotypesContext create(final int nGenotypes) {
        return new GenotypesContext(nGenotypes);
    }

    /**
     * Create a fully resolved GenotypeContext containing genotypes, sample lookup table,
     * and sorted sample names
     *
     * @param genotypes our genotypes in arbitrary order
     * @param sampleNameToOffset map optimized for efficient lookup. Each genotype in genotypes must have its
     * sample name in sampleNameToOffset, with a corresponding integer value that indicates the offset of that
     * genotype in the vector of genotypes
     * @param sampleNamesInOrder a list of sample names, one for each genotype in genotypes, sorted in alphabetical
     * order.
     * @return a mutable GenotypeContext containing genotypes with already present lookup data
     */
    public static final GenotypesContext create(final ArrayList<Genotype> genotypes,
                                                final Map<String, Integer> sampleNameToOffset,
                                                final List<String> sampleNamesInOrder) {
        return new GenotypesContext(genotypes, sampleNameToOffset, sampleNamesInOrder);
    }

    /**
     * Create a GenotypeContext backed directly by genotypes (the list is not copied).
     *
     * @param genotypes our genotypes in arbitrary order, may be null
     * @return a mutable GenotypeContext containing genotypes, or the shared immutable
     * {@link #NO_GENOTYPES} when genotypes is null
     */
    public static final GenotypesContext create(final ArrayList<Genotype> genotypes) {
        return genotypes == null ? NO_GENOTYPES : new GenotypesContext(genotypes);
    }

    /**
     * Create a GenotypeContext containing the given genotypes, in argument order.
     *
     * @param genotypes our genotypes in arbitrary order
     * @return a mutable GenotypeContext containing genotypes
     */
    public static final GenotypesContext create(final Genotype... genotypes) {
        return create(new ArrayList<Genotype>(Arrays.asList(genotypes)));
    }

    /**
     * Create a freshly allocated GenotypeContext containing the genotypes in toCopy
     *
     * @param toCopy the GenotypesContext to copy, may be null
     * @return a mutable GenotypeContext containing genotypes, or the shared immutable
     * {@link #NO_GENOTYPES} when toCopy is null (consistent with {@link #copy(Collection)})
     */
    public static final GenotypesContext copy(final GenotypesContext toCopy) {
        return toCopy == null ? NO_GENOTYPES : create(new ArrayList<Genotype>(toCopy.getGenotypes()));
    }

    /**
     * Create a GenotypesContext containing the genotypes in iteration order contained
     * in toCopy
     *
     * @param toCopy the collection of genotypes, may be null
     * @return a mutable GenotypeContext containing genotypes, or the shared immutable
     * {@link #NO_GENOTYPES} when toCopy is null
     */
    public static final GenotypesContext copy(final Collection<Genotype> toCopy) {
        return toCopy == null ? NO_GENOTYPES : create(new ArrayList<Genotype>(toCopy));
    }

    // ---------------------------------------------------------------------------
    //
    // Mutability methods
    //
    // ---------------------------------------------------------------------------

    /**
     * Marks this context as immutable; subsequent mutator calls throw.
     *
     * @return this context, to allow chaining
     */
    public final GenotypesContext immutable() {
        immutable = true;
        return this;
    }

    /**
     * @return true if mutator methods may still be invoked on this context
     */
    public boolean isMutable() {
        return ! immutable;
    }

    /**
     * Guard invoked at the start of every mutator.
     *
     * @throws IllegalAccessError if this context has been made immutable
     */
    public final void checkImmutability() {
        if ( immutable ) {
            throw new IllegalAccessError("GenotypeMap is currently immutable, but a mutator method was invoked on it");
        }
    }

    // ---------------------------------------------------------------------------
    //
    // caches
    //
    // ---------------------------------------------------------------------------

    /** Drops the sample name to offset lookup table; it is rebuilt on demand. */
    protected void invalidateSampleNameMap() {
        sampleNameToOffset = null;
    }

    /** Drops the sorted sample name list; it is rebuilt on demand. */
    protected void invalidateSampleOrdering() {
        sampleNamesInOrder = null;
    }

    /** Drops the cached maximum ploidy; it is recomputed by the next getMaxPloidy call. */
    private void invalidateMaxPloidy() {
        maxPloidy = -1;
    }

    /**
     * Builds the alphabetically sorted list of sample names if it is not currently cached.
     */
    protected void ensureSampleOrdering() {
        if ( sampleNamesInOrder == null ) {
            sampleNamesInOrder = new ArrayList<String>(size());
            for ( int i = 0; i < size(); i++ ) {
                sampleNamesInOrder.add(getGenotypes().get(i).getSampleName());
            }
            Collections.sort(sampleNamesInOrder);
        }
    }

    /**
     * Builds the sample name to offset lookup table if it is not currently cached.
     */
    protected void ensureSampleNameMap() {
        if ( sampleNameToOffset == null ) {
            sampleNameToOffset = new HashMap<String, Integer>(size());
            for ( int i = 0; i < size(); i++ ) {
                sampleNameToOffset.put(getGenotypes().get(i).getSampleName(), i);
            }
        }
    }

    // ---------------------------------------------------------------------------
    //
    // Lazy methods
    //
    // ---------------------------------------------------------------------------

    /**
     * @return true if this context is a {@link LazyGenotypesContext} whose unparsed genotype
     * data is non-null
     */
    public boolean isLazyWithData() {
        return this instanceof LazyGenotypesContext &&
                ((LazyGenotypesContext)this).getUnparsedGenotypeData() != null;
    }

    // ---------------------------------------------------------------------------
    //
    // Map methods
    //
    // ---------------------------------------------------------------------------

    /**
     * The single access point to the backing list; all other methods go through it
     * instead of reading the field directly.
     *
     * @return the backing list of genotypes
     */
    protected ArrayList<Genotype> getGenotypes() {
        return notToBeDirectlyAccessedGenotypes;
    }

    /**
     * Removes every genotype and invalidates all caches.
     *
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public void clear() {
        checkImmutability();
        invalidateSampleNameMap();
        invalidateSampleOrdering();
        invalidateMaxPloidy();
        getGenotypes().clear();
    }

    @Override
    public int size() {
        return getGenotypes().size();
    }

    @Override
    public boolean isEmpty() {
        return getGenotypes().isEmpty();
    }

    /**
     * Adds a single genotype to this context.
     *
     * There are many constraints on this input, and important
     * impacts on the performance of other functions provided by this
     * context.
     *
     * First, the sample name of genotype must be unique within this
     * context. However, this is not enforced in the code itself, so adding
     * a duplicate sample silently violates the contract of this context.
     *
     * Second, if the sample name to index map is currently built, adding a genotype
     * updates it in place, so add() followed by containsSample and related functions
     * is an efficient series of operations.
     *
     * Third, adding the genotype invalidates the sorted list of sample names, so
     * add() followed by any of the SampleNamesInOrder operations is inefficient, as
     * each SampleNamesInOrder must rebuild the sorted list of sample names at
     * an O(n log n) cost.
     *
     * The cached maximum ploidy is invalidated as well.
     *
     * @param genotype the genotype to append
     * @return the result of appending to the backing list
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public boolean add(final Genotype genotype) {
        checkImmutability();
        invalidateSampleOrdering();
        invalidateMaxPloidy();

        if ( sampleNameToOffset != null ) {
            // update the name map by adding entries; the new genotype lands at index size()
            sampleNameToOffset.put(genotype.getSampleName(), size());
        }

        return getGenotypes().add(genotype);
    }

    /**
     * Positional insertion is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void add(final int i, final Genotype genotype) {
        throw new UnsupportedOperationException();
    }

    /**
     * Adds all of the genotypes to this context
     *
     * See {@link #add(Genotype)} for important information about this function's
     * constraints and performance costs
     *
     * @param genotypes the genotypes to append, in iteration order
     * @return the result of appending to the backing list
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public boolean addAll(final Collection<? extends Genotype> genotypes) {
        checkImmutability();
        invalidateSampleOrdering();
        invalidateMaxPloidy();

        if ( sampleNameToOffset != null ) {
            // update the name map by adding entries, starting at the current end of the list
            int pos = size();
            for ( final Genotype g : genotypes ) {
                sampleNameToOffset.put(g.getSampleName(), pos++);
            }
        }

        return getGenotypes().addAll(genotypes);
    }

    /**
     * Positional insertion is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public boolean addAll(final int i, final Collection<? extends Genotype> genotypes) {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean contains(final Object o) {
        return getGenotypes().contains(o);
    }

    @Override
    public boolean containsAll(final Collection<?> objects) {
        return getGenotypes().containsAll(objects);
    }

    /**
     * @return true if at least one of genotypes is contained in this context
     */
    private boolean containsAny(final Collection<? extends Genotype> genotypes) {
        for ( final Genotype g : genotypes ) {
            if ( contains(g) ) {
                return true;
            }
        }
        return false;
    }

    @Override
    public Genotype get(final int i) {
        return getGenotypes().get(i);
    }

    /**
     * What is the max ploidy among all samples? Returns defaultPloidy if no genotypes are
     * present or if no genotype reports a ploidy above 0.
     *
     * The observed maximum is cached until the next mutation of this context. The default is
     * applied on every call and never cached, so successive calls with different defaults
     * each get their own default back.
     *
     * @param defaultPloidy the default ploidy, if all samples are no-called
     * @return the maximum observed ploidy, or defaultPloidy when that maximum is 0
     * @throws IllegalArgumentException if defaultPloidy is negative
     */
    public int getMaxPloidy(final int defaultPloidy) {
        if ( defaultPloidy < 0 ) {
            throw new IllegalArgumentException("defaultPloidy must be greater than or equal to 0");
        }

        if ( maxPloidy == -1 ) {
            int observed = 0; // necessary in the case where there are no genotypes
            for ( final Genotype g : getGenotypes() ) {
                observed = Math.max(g.getPloidy(), observed);
            }
            maxPloidy = observed;
        }

        // everything is no called so we return the default ploidy
        return maxPloidy == 0 ? defaultPloidy : maxPloidy;
    }

    /**
     * Gets sample associated with this sampleName, or null if none is found
     *
     * @param sampleName the sample name to look up
     * @return the genotype stored for sampleName, or null if there is none
     */
    public Genotype get(final String sampleName) {
        final Integer offset = getSampleI(sampleName);
        return offset == null ? null : getGenotypes().get(offset);
    }

    /**
     * @return the offset of sampleName in the backing list, or null if the sample is unknown
     */
    private Integer getSampleI(final String sampleName) {
        ensureSampleNameMap();
        return sampleNameToOffset.get(sampleName);
    }

    @Override
    public int indexOf(final Object o) {
        return getGenotypes().indexOf(o);
    }

    /**
     * Iterates over the genotypes in list order. {@link Iterator#remove()} on the returned
     * iterator behaves like the other mutators: it is rejected when this context is
     * immutable and it invalidates the caches.
     *
     * @return an iterator over the genotypes of this context
     */
    @Override
    public Iterator<Genotype> iterator() {
        return new GuardedIterator(getGenotypes().iterator());
    }

    /**
     * Iterator over the backing list whose remove() honours immutability and keeps the
     * caches consistent with the list.
     */
    private final class GuardedIterator implements Iterator<Genotype> {
        private final Iterator<Genotype> delegate;

        private GuardedIterator(final Iterator<Genotype> delegate) {
            this.delegate = delegate;
        }

        @Override
        public boolean hasNext() {
            return delegate.hasNext();
        }

        @Override
        public Genotype next() {
            return delegate.next();
        }

        @Override
        public void remove() {
            checkImmutability();
            // remove first: if the delegate rejects the call the caches are still valid
            delegate.remove();
            invalidateSampleNameMap();
            invalidateSampleOrdering();
            invalidateMaxPloidy();
        }
    }

    @Override
    public int lastIndexOf(final Object o) {
        return getGenotypes().lastIndexOf(o);
    }

    /**
     * List iterators are not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public ListIterator<Genotype> listIterator() {
        // todo -- must be immutable
        throw new UnsupportedOperationException();
    }

    /**
     * List iterators are not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public ListIterator<Genotype> listIterator(final int i) {
        // todo -- must be immutable
        throw new UnsupportedOperationException();
    }

    /**
     * Note that remove requires us to invalidate our sample to index
     * cache. The loop:
     *
     * <pre>
     * GenotypesContext gc = ...
     * for ( sample in samples )
     *   if ( gc.containsSample(sample) )
     *     gc.remove(sample)
     * </pre>
     *
     * is extremely inefficient, as each call to remove invalidates the cache
     * and containsSample requires us to rebuild it, an O(n) operation.
     *
     * If you must remove many samples from the GC, use either removeAll or retainAll
     * to avoid this O(n * m) operation.
     *
     * @param i the index of the genotype to remove
     * @return the genotype that was removed
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public Genotype remove(final int i) {
        checkImmutability();
        invalidateSampleNameMap();
        invalidateSampleOrdering();
        invalidateMaxPloidy();
        return getGenotypes().remove(i);
    }

    /**
     * See {@link #remove(int)} for an important warning about performance.
     *
     * @param o the genotype to remove
     * @return true if the backing list contained o
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public boolean remove(final Object o) {
        checkImmutability();
        invalidateSampleNameMap();
        invalidateSampleOrdering();
        invalidateMaxPloidy();
        return getGenotypes().remove(o);
    }

    /**
     * Removes every genotype contained in objects and invalidates all caches.
     *
     * @param objects the genotypes to remove
     * @return true if the backing list changed
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public boolean removeAll(final Collection<?> objects) {
        checkImmutability();
        invalidateSampleNameMap();
        invalidateSampleOrdering();
        invalidateMaxPloidy();
        return getGenotypes().removeAll(objects);
    }

    /**
     * Keeps only the genotypes contained in objects and invalidates all caches.
     *
     * @param objects the genotypes to keep
     * @return true if the backing list changed
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public boolean retainAll(final Collection<?> objects) {
        checkImmutability();
        invalidateSampleNameMap();
        invalidateSampleOrdering();
        invalidateMaxPloidy();
        return getGenotypes().retainAll(objects);
    }

    /**
     * Replaces the genotype at index i. The sample name to offset map, if built, is updated
     * in place; the sorted sample name list and the cached maximum ploidy are invalidated.
     *
     * @param i the index to overwrite
     * @param genotype the genotype to store at index i
     * @return the genotype previously stored at index i
     * @throws IllegalAccessError if this context is immutable
     */
    @Override
    public Genotype set(final int i, final Genotype genotype) {
        checkImmutability();
        final Genotype prev = getGenotypes().set(i, genotype);

        invalidateSampleOrdering();
        invalidateMaxPloidy();
        if ( sampleNameToOffset != null ) {
            // update the name map by removing the old entry and replacing it with the new one
            sampleNameToOffset.remove(prev.getSampleName());
            sampleNameToOffset.put(genotype.getSampleName(), i);
        }

        return prev;
    }

    /**
     * Replaces the genotype in this context -- note for efficiency
     * reasons we do not add the genotype if it's not present. The
     * return value will be null indicating this happened.
     *
     * Note this operation preserves the map cache Sample to Offset but
     * invalidates the sorted list of samples. Using replace within a loop
     * containing any of the SampleNameInOrder operations requires an O(n log n)
     * resorting after each replace operation.
     *
     * @param genotype a non null genotype to bind in this context
     * @return null if genotype was not added, otherwise returns the previous genotype
     * @throws IllegalAccessError if this context is immutable
     */
    public Genotype replace(final Genotype genotype) {
        checkImmutability();
        final Integer offset = getSampleI(genotype.getSampleName());
        if ( offset == null ) {
            return null;
        }
        return set(offset, genotype);
    }

    /**
     * Returns a view of the backing list between i (inclusive) and i1 (exclusive).
     *
     * When this context is immutable the view is unmodifiable. For a mutable context the
     * view is the raw sub-list of the backing list: writing through it does NOT update or
     * invalidate the caches of this context, so callers should treat it as read-only.
     *
     * @param i low endpoint (inclusive)
     * @param i1 high endpoint (exclusive)
     * @return a view of the requested range
     */
    @Override
    public List<Genotype> subList(final int i, final int i1) {
        final List<Genotype> view = getGenotypes().subList(i, i1);
        return immutable ? Collections.unmodifiableList(view) : view;
    }

    @Override
    public Object[] toArray() {
        return getGenotypes().toArray();
    }

    @Override
    public <T> T[] toArray(final T[] ts) {
        return getGenotypes().toArray(ts);
    }

    /**
     * Iterate over the Genotypes in this context in the order specified by sampleNamesInOrder
     *
     * @param sampleNamesInOrder an Iterable of String, containing exactly one entry for each Genotype sample name in
     * this context. A name that is not present in this context yields a null element.
     * @return an Iterable over the genotypes in this context.
     */
    public Iterable<Genotype> iterateInSampleNameOrder(final Iterable<String> sampleNamesInOrder) {
        return new Iterable<Genotype>() {
            @Override
            public Iterator<Genotype> iterator() {
                return new InOrderIterator(sampleNamesInOrder.iterator());
            }
        };
    }

    /**
     * Iterate over the Genotypes in this context in their sample name order (A, B, C)
     * regardless of the underlying order in the vector of genotypes
     * @return an Iterable over the genotypes in this context.
     */
    public Iterable<Genotype> iterateInSampleNameOrder() {
        return iterateInSampleNameOrder(getSampleNamesOrderedByName());
    }

    /**
     * Read-only iterator that walks a sequence of sample names and resolves each one to its
     * genotype through {@link #get(String)}.
     */
    private final class InOrderIterator implements Iterator<Genotype> {
        final Iterator<String> sampleNamesInOrder;

        private InOrderIterator(final Iterator<String> sampleNamesInOrder) {
            this.sampleNamesInOrder = sampleNamesInOrder;
        }

        @Override
        public boolean hasNext() {
            return sampleNamesInOrder.hasNext();
        }

        @Override
        public Genotype next() {
            return get(sampleNamesInOrder.next());
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * The returned set is the key set of the internal lookup table (not a copy), so callers
     * must not modify it.
     *
     * @return The set of sample names for all genotypes in this context, in arbitrary order
     */
    public Set<String> getSampleNames() {
        ensureSampleNameMap();
        return sampleNameToOffset.keySet();
    }

    /**
     * The returned list is the internally cached list (not a copy), so callers must not
     * modify it.
     *
     * @return The list of sample names for all genotypes in this context, in their natural ordering (A, B, C)
     */
    public List<String> getSampleNamesOrderedByName() {
        ensureSampleOrdering();
        return sampleNamesInOrder;
    }

    /**
     * @param sample the sample name to test
     * @return true if a genotype for sample is present in this context
     */
    public boolean containsSample(final String sample) {
        ensureSampleNameMap();
        return sampleNameToOffset.containsKey(sample);
    }

    /**
     * @param samples the sample names to test
     * @return true if every name in samples has a genotype in this context
     */
    public boolean containsSamples(final Collection<String> samples) {
        return getSampleNames().containsAll(samples);
    }

    /**
     * Return a freshly allocated subcontext of this context containing only the samples
     * listed in samples. Note that samples can contain names not in this context, they
     * will just be ignored.
     *
     * @param samples the names of the samples to keep
     * @return a new mutable context holding the matching genotypes in the iteration order of
     * samples, or the shared immutable {@link #NO_GENOTYPES} when samples is empty
     */
    public GenotypesContext subsetToSamples( final Set<String> samples ) {
        final int nSamples = samples.size();

        if ( nSamples == 0 ) {
            return NO_GENOTYPES;
        }

        // at most nSamples genotypes can end up in the subset
        final GenotypesContext subset = create(nSamples);
        for ( final String sample : samples ) {
            final Genotype g = get(sample);
            if ( g != null ) {
                subset.add(g);
            }
        }
        return subset;
    }

    /**
     * @return the genotypes in sample name order, comma separated and wrapped in brackets
     */
    @Override
    public String toString() {
        final List<String> gS = new ArrayList<String>();
        for ( final Genotype g : this.iterateInSampleNameOrder() ) {
            gS.add(g.toString());
        }
        return "[" + join(",", gS) + "]";
    }

    /**
     * Joins the string forms of objects with separator between consecutive elements.
     * Copied from Utils.
     *
     * @return the joined string, or the empty string for an empty collection
     */
    private static <T> String join(final String separator, final Collection<T> objects) {
        final Iterator<T> iter = objects.iterator();
        if ( ! iter.hasNext() ) { // fast path for empty collection
            return "";
        }

        final String first = iter.next().toString();
        if ( ! iter.hasNext() ) { // fast path for singleton collections
            return first;
        }

        // full path for 2+ collection that actually need a join
        final StringBuilder ret = new StringBuilder(first);
        while ( iter.hasNext() ) {
            ret.append(separator);
            ret.append(iter.next().toString());
        }
        return ret.toString();
    }
}