/*******************************************************************************
* GenPlay, Einstein Genome Analyzer
* Copyright (C) 2009, 2014 Albert Einstein College of Medicine
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu>
* Nicolas Fourel <nicolas.fourel@einstein.yu.edu>
* Eric Bouhassira <eric.bouhassira@einstein.yu.edu>
*
* Website: <http://genplay.einstein.yu.edu>
******************************************************************************/
package edu.yu.einstein.genplay.core.multiGenome.operation.synchronization;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import edu.yu.einstein.genplay.core.manager.project.MultiGenomeProject;
import edu.yu.einstein.genplay.core.manager.project.ProjectChromosomes;
import edu.yu.einstein.genplay.core.manager.project.ProjectManager;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFLine;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFFile.VCFFile;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFScanner.VCFGenomeScanner;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFScanner.VCFScanner;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFScanner.VCFScannerReceiver;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFStatistics.VCFFileStatistics;
import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFStatistics.VCFSampleStatistics;
import edu.yu.einstein.genplay.core.multiGenome.data.display.content.MGLineContent;
import edu.yu.einstein.genplay.core.multiGenome.data.synchronization.MGSAllele;
import edu.yu.einstein.genplay.core.multiGenome.data.synchronization.MGSGenome;
import edu.yu.einstein.genplay.core.multiGenome.data.synchronization.MGSOffset;
import edu.yu.einstein.genplay.core.multiGenome.filter.MGFilter;
import edu.yu.einstein.genplay.core.multiGenome.utils.FormattedMultiGenomeName;
import edu.yu.einstein.genplay.core.multiGenome.utils.VCFLineUtility;
import edu.yu.einstein.genplay.dataStructure.chromosome.Chromosome;
import edu.yu.einstein.genplay.dataStructure.enums.VariantType;
import edu.yu.einstein.genplay.dataStructure.list.chromosomeWideList.offsetList.IntArrayAsOffsetList;
/**
* @author Nicolas Fourel
* @version 0.1
*/
public class MGSynchronizer implements VCFScannerReceiver, Serializable {
/** Default serial version ID */
private static final long serialVersionUID = -6135675382773268961L;
private static final int SAVED_FORMAT_VERSION_NUMBER = 0; // saved format version
/** Index for reference */
public final static int REFERENCE = -1;
/** Index for no call */
public final static int NO_CALL = -2;
private final MultiGenomeProject multiGenomeProject;
// Attributes used for the synchronization
private Map<Chromosome, Integer> chromosomeIndexes;
private List<List<MGSOffset>> referenceOffsetList;
private VCFFile currentFile;
private VCFFileStatistics currentStatistics;
private List<String> currentGenomes;
/**
* Constructor of {@link MGSynchronizer}
* @param multiGenomeProject the multi genome instance
*/
public MGSynchronizer (MultiGenomeProject multiGenomeProject) {
this.multiGenomeProject = multiGenomeProject;
referenceOffsetList = multiGenomeProject.getMultiGenome().getReferenceGenome().getAllele().getOffsetList();
}
/**
* Creates an offset
* @param offsetList the full list of offsets
* @param currentOffset the offset being processed
* @param lastRefPosition the last reference genome position
* @param additionalLength the total length of all insertion between two deletion
* @param additionalValue only in the case where two or more insertion happen at the same position and the one of the actual offset is not the longest one
* @return a new offset corresponding to the current process
*/
private MGSOffset getNewOffset (List<MGSOffset> offsetList, MGSOffset currentOffset, int lastRefPosition, int additionalLength, int additionalValue) {
MGSOffset offset = null;
int newGenomePosition = 0;
int newOffsetValue = 0;
if (offsetList.size() == 0) {
newGenomePosition = currentOffset.getPosition() + additionalLength;
} else {
newGenomePosition = (currentOffset.getPosition() - lastRefPosition) + getOffset(offsetList).getPosition() + additionalLength;
newOffsetValue = getOffset(offsetList).getValue();
}
newOffsetValue += Math.abs(currentOffset.getValue());
newGenomePosition += additionalValue + 1;
offset = new MGSOffset(newGenomePosition, newOffsetValue);
return offset;
}
/**
* Look for an offset in an offset list according to an index.
* Checks if the index is valid and return the offset.
* @param offsetList the list of offset
* @param index the index of the offset in the list
* @return the offset if the index is valid, null otherwise
*/
private MGSOffset getOffset (List<MGSOffset> offsetList) {
if (offsetList.size() > 0) {
return offsetList.get(offsetList.size() - 1);
}
return null;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// Read files & insert data
/**
* Look for an offset in an offset list according to an index.
* Checks if the index is valid and return the offset.
* @param offsetList the list of offset
* @param index the index of the offset in the list
* @return the offset if the index is valid, null otherwise
*/
private MGSOffset getOffset (List<MGSOffset> offsetList, int index) {
if (index < offsetList.size()) {
return offsetList.get(index);
}
return null;
}
/**
* Retrieves the name of the genomes required in the project that are present in a VCF file.
* A project does not necessary require all genomes present in a VCF file.
* @param vcfFile VCF file
* @return the list of genome names required for the project and present in the VCF file
*/
private List<String> getRequiredGenomeNamesFromAReader (VCFFile vcfFile) {
List<String> requiredGenomeNames = new ArrayList<String>();
List<String> allGenomeNames = multiGenomeProject.getGenomeNames();
List<String> readerGenomeNames = vcfFile.getHeader().getGenomeNames();
for (String readerGenomeName: readerGenomeNames) {
if (allGenomeNames.contains(readerGenomeName)) {
requiredGenomeNames.add(readerGenomeName);
}
}
return requiredGenomeNames;
}
/**
* @param alternative ALT field (or part of it)
* @return true if the given alternative is coded as an SV
*/
private boolean isStructuralVariant (String alternative) {
if (alternative.charAt(0) == '<') {
return true;
}
return false;
}
/**
* Defines if a variant is heterozygote according to its genotype.
* @param firstAlleleNumber number related to the "first" allele
* @param secondAlleleNumber number related to the "second" allele
* @return true if the variant is heterozygote, false otherwise
*/
private boolean isVariantHeterozygote (int firstAlleleNumber, int secondAlleleNumber) {
if (firstAlleleNumber != secondAlleleNumber) {
if ((firstAlleleNumber >= 0) || (secondAlleleNumber >= 0)) {
return true;
}
}
return false;
}
/**
* Defines if a variant is homozygote according to its genotype.
* @param firstAlleleNumber number related to the "first" allele
* @param secondAlleleNumber number related to the "second" allele
* @return true if the variant is homozygote, false otherwise
*/
private boolean isVariantHomozygote (int firstAlleleNumber, int secondAlleleNumber) {
if ((firstAlleleNumber == secondAlleleNumber) && (firstAlleleNumber >= 0)) {
return true;
}
return false;
}
/**
* Performs the synchronization for every genome of the project.
* The process is separated on 3 levels:
* - genomes
* - alleles
* - chromosomes
* (Makes the reading easier)
*/
public void performPositionSynchronization () {
referenceOffsetList = multiGenomeProject.getMultiGenome().getReferenceGenome().getAllele().getOffsetList();
synchronizeToGenomesLevel();
multiGenomeProject.getMultiGenome().getReferenceGenome().synchronizePosition();
}
/**
* Processes all files and insert required synchronization data into memory.
* @param genomes
* @param variations
* @param filters
* @throws IOException
*/
public void processFiles (List<String> genomes, List<VariantType> variations, List<MGFilter> filters) throws IOException {
List<VCFFile> VCFFileList = multiGenomeProject.getAllVCFFiles(); // get all vcf readers
for (VCFFile vcfFile: VCFFileList) {
chromosomeIndexes = new HashMap<Chromosome, Integer>();
currentFile = vcfFile;
currentStatistics = vcfFile.getStatistics();
currentGenomes = getRequiredGenomeNamesFromAReader(vcfFile);
VCFScanner scanner = new VCFGenomeScanner(this, vcfFile);
scanner.setGenomes(genomes);
scanner.setVariations(variations);
scanner.setFilters(filters);
scanner.compute();
currentStatistics.processStatistics();
}
multiGenomeProject.getFileContentManager().compact();
}
@Override
public void processLine(VCFLine line) {
// Set global information
Chromosome chromosome = line.getChromosome();
if (chromosome == null) {
return;
}
int referencePosition = line.getReferencePosition(); // get the reference genome position (POS field)
updateFileStatistics(currentStatistics, line.getAlternativesTypes(), line.getAlternatives());
// Set position information
MGLineContent position = new MGLineContent();
position.setReferenceGenomePosition(referencePosition);
position.setScore(line.getQuality());
position.setAlternatives(line.getAlternativesLength());
Map<String, byte[]> genotypes = new HashMap<String, byte[]>();
// Start genome scanning
for (String genomeName: currentGenomes) { // loop on every genome raw name
MGSGenome genome = multiGenomeProject.getMultiGenome().getGenomeInformation(genomeName);
String genomeRawName = FormattedMultiGenomeName.getRawName(genomeName);
String genotype = line.getGenotype(genomeRawName); // get the related format value, split it (colon separated) into an array, get the first value: the genotype
genotype = genotype.replace('|', '/');
String[] currentAltIndexes = genotype.split("/");
byte[] byteGenotypeArray = new byte[currentAltIndexes.length];
for (int i = 0; i < currentAltIndexes.length; i++) {
int currentAltIndex = VCFLineUtility.getAlleleIndex(currentAltIndexes[i]);
switch (currentAltIndex) {
case NO_CALL:
byteGenotypeArray[i] = NO_CALL;
break;
case REFERENCE:
byteGenotypeArray[i] = REFERENCE;
break;
default:
ProjectChromosomes projectChromosomes = ProjectManager.getInstance().getProjectChromosomes();
int chromosomeIndex = projectChromosomes.getIndex(chromosome);
byteGenotypeArray[i] = (byte) currentAltIndex;
int alternativeLength = line.getAlternativesLength()[currentAltIndex]; // we retrieve its length
VariantType variantType = line.getAlternativesTypes()[currentAltIndex]; // get the type of variant according to the length of the variation
updateVariationSampleStatistics(currentStatistics.getSampleStatistics(genomeName), variantType, line.getAlternatives()[currentAltIndex]);
currentFile.addVariantType(genomeName, variantType); // notice the reader of the variant type
if (variantType != VariantType.SNPS) {
genome.getAllele(i).getOffsetList().get(chromosomeIndex).add(new MGSOffset(referencePosition, alternativeLength));
if (variantType == VariantType.INSERTION) {
referenceOffsetList.get(chromosomeIndex).add(new MGSOffset(referencePosition, alternativeLength)); // add the offset to the reference genome allele if it is an insertion
}
}
break;
}
}
updateGenotypeSampleStatistics(currentStatistics.getSampleStatistics(genomeName), line.getAlternativesTypes(), byteGenotypeArray);
genotypes.put(genomeName, byteGenotypeArray);
}
// Finish position creation
position.setGenotypes(genotypes);
if (chromosomeIndexes.get(chromosome) == null) {
chromosomeIndexes.put(chromosome, 0);
}
int currentIndex = chromosomeIndexes.get(chromosome);
multiGenomeProject.getFileContentManager().getContent(currentFile, chromosome).addPosition(currentIndex, position);
chromosomeIndexes.put(chromosome, currentIndex + 1);
}
/**
* Method used for unserialization
* @param in
* @throws IOException
* @throws ClassNotFoundException
*/
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
in.readInt();
chromosomeIndexes = (Map<Chromosome, Integer>) in.readObject();
referenceOffsetList = (List<List<MGSOffset>>) in.readObject();
currentFile = (VCFFile) in.readObject();
currentStatistics = (VCFFileStatistics) in.readObject();
currentGenomes = (List<String>) in.readObject();
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////// Synchronizes the data
/**
* This method manages the synchronization for the chromosome list of list of an allele.
* It manages the chromosome loop in order to process the synchronization for every chromosome of this list.
* @param chromosomeAlleleOffsetList chromosome list of list of an allele
* @return a synchronized chromosome list of list
*/
private List<List<MGSOffset>> synchronizeToAlleleLevel (List<List<MGSOffset>> chromosomeAlleleOffsetList) {
int chromosomeListSize = ProjectManager.getInstance().getProjectChromosomes().getChromosomeList().size(); // get the number of chromosome
List<List<MGSOffset>> list = new ArrayList<List<MGSOffset>>(); // instantiate a new chromosome list of list (to insert the synchronized list of offset)
for (int i = 0; i < chromosomeListSize; i++) { // scan on the number of chromosome (loop on ever chromosome)
List<MGSOffset> referenceOffsetList = this.referenceOffsetList.get(i); // get the list of offset from the reference genome for the current chromosome
List<MGSOffset> alleleOffsetList = chromosomeAlleleOffsetList.get(i); // get the list of offset from the current genome for the current chromosome
List<MGSOffset> alleleOffsetListTmp = synchronizeToChromosomeLevel(referenceOffsetList, alleleOffsetList); // get the synchronized offset list
list.add(alleleOffsetListTmp); // add the synchronized offset list to the chromosome list of list
}
return list; // return the chromosome list of list recently created
}
/**
* This method performs the synchronization of a list of offset using the offset list from the reference genome.
* @param referenceOffsetList offset list from the reference genome
* @param alleleOffsetList offset list to synchronize
* @return a synchronized list of offset
*/
private List<MGSOffset> synchronizeToChromosomeLevel (List<MGSOffset> referenceOffsetList, List<MGSOffset> alleleOffsetList) {
List<MGSOffset> list = new IntArrayAsOffsetList();
int referenceOffsetIndex = 0; // index of the current offset from the reference genome
int alleleOffsetIndex = 0; // index of the current offset from the genome
int lastRefPosition = 0; // last genome reference position of a variation
int length = 0; // total length of all insertion between two deletion
boolean valid = true;
while (valid) {
MGSOffset currentReferenceOffset = getOffset(referenceOffsetList, referenceOffsetIndex); // get the current reference offset
MGSOffset currentAlleleOffset = getOffset(alleleOffsetList, alleleOffsetIndex); // get the current offset from the current genome
MGSOffset newOffset = null; // declare a new offset
if ((currentReferenceOffset != null) && (currentAlleleOffset != null)) { // if both offset exist
if (currentAlleleOffset.getPosition() == currentReferenceOffset.getPosition()) { // if both position are similar
if (currentReferenceOffset.getValue() > currentAlleleOffset.getValue()) { // if the value from the reference offset is higher than the one from the current allele, it means that an other genome of the project contains a bigger insertion.
newOffset = getNewOffset(list, currentAlleleOffset, lastRefPosition, length, currentAlleleOffset.getValue()); // get the new offset
lastRefPosition = currentReferenceOffset.getPosition() + 1; // the last reference position is the current reference position + 1
length = 0; // reset the length counter
} else if (currentReferenceOffset.getValue() == currentAlleleOffset.getValue()) { // if both value (length) are equal, there is no new offset
length += currentReferenceOffset.getValue(); // but the length must be taken into account
}
referenceOffsetIndex++; // increase the current reference offset
alleleOffsetIndex++; // increase the current genome offset
} else if(currentAlleleOffset.getPosition() < currentReferenceOffset.getPosition()) {
if (currentAlleleOffset.getValue() < 0) { // if the current offset is related to a deletion
newOffset = getNewOffset(list, currentAlleleOffset, lastRefPosition, length, 0); // get the new offset
lastRefPosition = currentAlleleOffset.getPosition() + Math.abs(currentAlleleOffset.getValue()) + 1;
length = 0; // reset the length counter
} else if (currentAlleleOffset.getValue() > 0) { // if the current offset is related to an insertion
length += currentAlleleOffset.getValue(); // the length must be taken into account
}
alleleOffsetIndex++; // increase the current genome offset only
} else if(currentReferenceOffset.getPosition() < currentAlleleOffset.getPosition()) {
newOffset = getNewOffset(list, currentReferenceOffset, lastRefPosition, length, 0); // get the new offset
lastRefPosition = currentReferenceOffset.getPosition() + 1;
length = 0; // reset the length counter
referenceOffsetIndex++; // increase the current reference offset only
}
} else if ((currentReferenceOffset == null) && (currentAlleleOffset != null)) { // if only the offset from the current genome exists
if (currentAlleleOffset.getValue() < 0) { // if the current offset is related to a deletion
newOffset = getNewOffset(list, currentAlleleOffset, lastRefPosition, length, 0); // get the new offset
lastRefPosition = currentAlleleOffset.getPosition() + Math.abs(currentAlleleOffset.getValue()) + 1;
length = 0; // reset the length counter
} else if (currentAlleleOffset.getValue() > 0) { // if the current offset is related to an insertion
length += currentAlleleOffset.getValue(); // the length must be taken into account
}
alleleOffsetIndex++; // increase the current genome offset
} else if ((currentReferenceOffset != null) && (currentAlleleOffset == null)) { // if only the current offset from the reference exists
newOffset = getNewOffset(list, currentReferenceOffset, lastRefPosition, length, 0); // get the new offset
lastRefPosition = currentReferenceOffset.getPosition() + 1;
length = 0; // reset the length counter
referenceOffsetIndex++; // increase the current reference offset only
} else {
valid = false;
}
if (newOffset != null) {
list.add(newOffset);
}
}
return list;
}
/**
* This method manages the position synchronization for every genome.
* It handles the genomes loop in order to process the synchronization for both alleles of each of them.
*/
private void synchronizeToGenomesLevel () {
for (String genomeName: multiGenomeProject.getGenomeNames()) { // scan on every genome
MGSGenome genomeInformation = multiGenomeProject.getMultiGenome().getGenomeInformation(genomeName); // current genome information
List<MGSAllele> alleles = genomeInformation.getAlleles();
for (MGSAllele allele: alleles) {
List<List<MGSOffset>> chromosomeAlleleAOffsetList = synchronizeToAlleleLevel(allele.getOffsetList()); // get the synchronized chromosome list of list for the current allele
allele.setOffsetList(chromosomeAlleleAOffsetList); // set the current chromosome list of list of the current allele with the synchronized one
}
}
}
/**
* Updates statistics related to the file
* @param statistic file statistics
* @param variantTypes variant type array
* @param alternatives alternatives array
*/
private void updateFileStatistics (VCFFileStatistics statistic, VariantType[] variantTypes, String[] alternatives) {
statistic.incrementNumberOfLines();
for (int i = 0; i < variantTypes.length; i++) {
if (variantTypes[i] == VariantType.SNPS) {
statistic.incrementNumberOfSNPs();
} else if (variantTypes[i] == VariantType.INSERTION) {
if (isStructuralVariant(alternatives[i])) {
statistic.incrementNumberOfLongInsertions();
} else {
statistic.incrementNumberOfShortInsertions();
}
} else if (variantTypes[i] == VariantType.DELETION) {
if (isStructuralVariant(alternatives[i])) {
statistic.incrementNumberOfLongDeletions();
} else {
statistic.incrementNumberOfShortDeletions();
}
}
}
}
/**
* @param statistic sample statistics
* @param variantTypes array of variant types
* @param firstAlleleNumber number of the first allele
* @param secondAlleleNumber number of the second allele
*/
private void updateGenotypeSampleStatistics (VCFSampleStatistics statistic, VariantType[] variantTypes, byte[] alleleIndexes) {
boolean hemizygote = false;
boolean homozygote = false;
boolean heterozygote = false;
if (alleleIndexes.length > 0) {
if (alleleIndexes.length == 1) {
hemizygote = true;
} else {
homozygote = isVariantHomozygote(alleleIndexes[0], alleleIndexes[1]);
heterozygote = isVariantHeterozygote(alleleIndexes[0], alleleIndexes[1]);
}
for (VariantType variantType: variantTypes) {
if (homozygote) {
if (alleleIndexes[0] > -1) {
if (variantType == VariantType.SNPS) {
statistic.incrementNumberOfHomozygoteSNPs();
} else if (variantType == VariantType.INSERTION) {
statistic.incrementNumberOfHomozygoteInsertions();
} else if (variantType == VariantType.DELETION) {
statistic.incrementNumberOfHomozygoteDeletions();
}
}
} else if (heterozygote) {
if (variantType == VariantType.SNPS) {
statistic.incrementNumberOfHeterozygoteSNPs();
} else if (variantType == VariantType.INSERTION) {
statistic.incrementNumberOfHeterozygoteInsertions();
} else if (variantType == VariantType.DELETION) {
statistic.incrementNumberOfHeterozygoteDeletions();
}
} else if (hemizygote) {
if (variantType == VariantType.SNPS) {
statistic.incrementNumberOfHemizygoteSNPs();
} else if (variantType == VariantType.INSERTION) {
statistic.incrementNumberOfHemizygoteInsertions();
} else if (variantType == VariantType.DELETION) {
statistic.incrementNumberOfHemizygoteDeletions();
}
}
}
}
}
/**
* Updates statistics related to the sample
* @param statistic sample statistics
* @param variantType variant type
* @param alternative alternative
*/
private void updateVariationSampleStatistics (VCFSampleStatistics statistic, VariantType variantType, String alternative) {
if (variantType == VariantType.SNPS) {
statistic.incrementNumberOfSNPs();
} else if (variantType == VariantType.INSERTION) {
if (isStructuralVariant(alternative)) {
statistic.incrementNumberOfLongInsertions();
} else {
statistic.incrementNumberOfShortInsertions();
}
} else if (variantType == VariantType.DELETION) {
if (isStructuralVariant(alternative)) {
statistic.incrementNumberOfLongDeletions();
} else {
statistic.incrementNumberOfShortDeletions();
}
}
}
/**
* Method used for serialization
* @param out
* @throws IOException
*/
private void writeObject(ObjectOutputStream out) throws IOException {
out.writeInt(SAVED_FORMAT_VERSION_NUMBER);
out.writeObject(chromosomeIndexes);
out.writeObject(referenceOffsetList);
out.writeObject(currentFile);
out.writeObject(currentStatistics);
out.writeObject(currentGenomes);
}
}