/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.multiGenome.data.display; import java.util.ArrayList; import java.util.Collections; import java.util.List; import edu.yu.einstein.genplay.core.comparator.VariantComparator; import edu.yu.einstein.genplay.core.manager.project.ProjectManager; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFFile.VCFFile; import edu.yu.einstein.genplay.core.multiGenome.data.display.content.MGChromosomeContent; import edu.yu.einstein.genplay.core.multiGenome.data.display.content.MGFileContentManager; import edu.yu.einstein.genplay.core.multiGenome.data.display.content.MGLineContent; import edu.yu.einstein.genplay.core.multiGenome.data.display.variant.DeletionVariant; import edu.yu.einstein.genplay.core.multiGenome.data.display.variant.InsertionVariant; import edu.yu.einstein.genplay.core.multiGenome.data.display.variant.NoCallVariant; import edu.yu.einstein.genplay.core.multiGenome.data.display.variant.ReferenceVariant; import edu.yu.einstein.genplay.core.multiGenome.data.display.variant.SNPVariant; import edu.yu.einstein.genplay.core.multiGenome.data.display.variant.Variant; import edu.yu.einstein.genplay.core.multiGenome.operation.synchronization.MGSynchronizer; import edu.yu.einstein.genplay.core.multiGenome.utils.FormattedMultiGenomeName; import edu.yu.einstein.genplay.core.multiGenome.utils.ShiftCompute; import edu.yu.einstein.genplay.dataStructure.chromosome.Chromosome; import edu.yu.einstein.genplay.dataStructure.enums.VariantType; /** * The {@link VariantDisplayListBuilder} builds all lists of {@link Variant} for all allele. * It is genome and {@link Chromosome} specific and gather information from all necessary {@link VCFFile}. * * @author Nicolas Fourel * @version 0.1 */ public class VariantDisplayListBuilder { private final MGFileContentManager contentManager; // The file content manager. private final String referenceGenomeName; // The name of the reference genome. private final String metaGenomeName; // The name of the meta genome. private List<List<Variant>> variants; // The lists of variants. private String genomeName; // The name of the genome to process. private List<VariantType> types; // The list of variant types to handle. private Chromosome chromosome; // The chromosome to process. private MGChromosomeContent currentContent; // The current chromosome content (when processing only) /** * Constructor of {@link VariantDisplayListBuilder} */ public VariantDisplayListBuilder () { contentManager = ProjectManager.getInstance().getMultiGenomeProject().getFileContentManager(); referenceGenomeName = ProjectManager.getInstance().getAssembly().getDisplayName(); metaGenomeName = FormattedMultiGenomeName.META_GENOME_NAME; } /** * Adds a list to another one * @param list01 list receiving the other list * @param list02 list containing the elements to add */ private void addFromListToList (List<List<Variant>> list01, List<List<Variant>> list02) { // Add missing lists int add = list02.size() - list01.size(); for (int i = 0; i < add; i++) { list01.add(new ArrayList<Variant>()); } for (int i = 0; i < list02.size(); i++) { list01.get(i).addAll(list02.get(i)); } } /** * Adjust the size of the {@link ReferenceVariant}. * They have been added earlier but they have to be shrinked in case of overlaps. * @param list a list of {@link Variant} * @return a list of {@link Variant} without overlap of {@link ReferenceVariant} */ private List<List<Variant>> adjustVariants (List<List<Variant>> list) { List<List<Variant>> newList = new ArrayList<List<Variant>>(); for (List<Variant> current: list) { List<Variant> newCurrent = new ArrayList<Variant>(); if (current.size() == 1) { newCurrent.add(current.get(0)); } else if (current.size() > 1) { Collections.sort(current, new VariantComparator()); int currentIndex = 1; Variant previousVariant = current.get(0); Variant currentVariant = current.get(currentIndex); newCurrent.add(previousVariant); while (currentVariant != null) { if (currentVariant instanceof ReferenceVariant) { if (!(previousVariant instanceof ReferenceVariant)) { currentVariant.setStart(previousVariant.getStop()); if (currentVariant.getLength() > 0) { newCurrent.add(currentVariant); } } } else { newCurrent.add(currentVariant); } currentIndex++; if (currentIndex < current.size()) { previousVariant = current.get(currentIndex - 1); currentVariant = current.get(currentIndex); } else { previousVariant = null; currentVariant = null; } } } newList.add(newCurrent); } return newList; } /** * @param line a {@link MGLineContent} * @return true if the given line defines for at least one of the required variation */ private boolean defineVariantType (MGLineContent line) { int[] alternatives = line.getAlternatives(); for (int alternative: alternatives) { if (alternative > 0) { if (types.contains(VariantType.INSERTION)) { return true; } } else if (alternative < 0) { if (types.contains(VariantType.DELETION)) { return true; } } else { if (types.contains(VariantType.SNPS)) { return true; } } } return false; } /** * Add the necessary references of all {@link Variant} of the list * @param list list of {@link Variant} * @return a list of {@link Variant} including references */ private List<List<Variant>> fillWithReferences (List<List<Variant>> list) { List<List<Variant>> newList = getEmptyList(list.size()); for (int i = 0; i < list.size(); i++) { List<Variant> currentList = list.get(i); for (Variant variant: currentList) { newList.get(i).add(variant); Variant reference = new ReferenceVariant(currentContent, variant.getReferencePositionIndex(), variant.getStart(), variant.getStop(), getReferenceType(variant)); for (int j = 0; j < list.size(); j++) { if (j != i) { newList.get(j).add(reference); } } } } return newList; } /** * Create a new genotype based on the native one. * All alternative indexes not required in the original genotype are replaced by reference indexes in the new genotype. * Examples: * - A genotype REF/INS where insertions are requested will be corrected as REF/INS. * - A genotype REF/INS where deletions are requested will be corrected as REF/REF, meaning there is nothing to process here. * - A genotype REF/INS/DEL where insertions are requested will be corrected as REF/INS/REF. * @param line the {@link MGLineContent} * @return the corrected genotype */ private byte[] getAdjustedGenotype (MGLineContent line) { byte[] genotype = line.getGenotypes().get(genomeName); byte[] newGenotype = new byte[genotype.length]; int[] alternatives = line.getAlternatives(); for (int i = 0; i < genotype.length; i++) { if (genotype[i] == MGSynchronizer.NO_CALL) { if (types.contains(VariantType.NO_CALL)) { newGenotype[i] = MGSynchronizer.NO_CALL; } } else if (genotype[i] == MGSynchronizer.REFERENCE) { newGenotype[i] = MGSynchronizer.REFERENCE; } else { boolean insert = false; int alternative = alternatives[genotype[i]]; if (alternative > 0) { if (types.contains(VariantType.INSERTION)) { insert = true; } } else if (alternative < 0) { if (types.contains(VariantType.DELETION)) { insert = true; } } else { if (types.contains(VariantType.SNPS)) { insert = true; } } if (insert) { newGenotype[i] = genotype[i]; } else { newGenotype[i] = MGSynchronizer.REFERENCE; } } } return newGenotype; } /** * Retrieves all {@link Variant} at one specific index and return the dominant one * @param positionindex the index of the position * @return the dominant {@link Variant}, null if no {@link Variant} */ private Variant getCurrentDominantVariant (int positionindex) { List<Variant> variants = currentContent.getVariants().getVariants(positionindex); List<Variant> eligibleVariants = new ArrayList<Variant>(); for (Variant current: variants) { if (types.contains(current.getType())) { eligibleVariants.add(current); } } return getDominantVariant(eligibleVariants); } /** * @param variants a list of {@link Variant} * @return the dominant {@link Variant} among a list of {@link Variant}, null if not found */ private Variant getDominantVariant (List<Variant> variants) { if ((variants == null) || (variants.size() == 0)) { return null; } if (variants.size() == 1) { return variants.get(0); } Variant variant = variants.get(0); for (int i = 1; i < variants.size(); i++) { if (variants.get(i).isDominant(variant)) { variant = variants.get(i); } } return variant; } /** * @param size size of the list * @return an empty list of list with the given size */ private List<List<Variant>> getEmptyList (int size) { List<List<Variant>> result = new ArrayList<List<Variant>>(); for (int i = 0; i < size; i++) { result.add(new ArrayList<Variant>()); } return result; } /** * @param chromosomeContent the {@link MGChromosomeContent} giving the maximum number of alleles * @return an empty list of {@link Variant} according to the maximum number of alleles */ private List<List<Variant>> getEmptyVariantList (MGChromosomeContent chromosomeContent) { int size = chromosomeContent.getMaxGenotypeNumber(); if (size < 2) { size = 2; } List<List<Variant>> list = new ArrayList<List<Variant>>(); for (int i = 0; i < size; i++) { list.add(new ArrayList<Variant>()); } return list; } /** * @param genomeName the name of a genome * @param types a list of {@link VariantType} * @return the list of {@link Variant} for the current chromosome */ public List<List<Variant>> getList (String genomeName, List<VariantType> types) { return getList(genomeName, types, ProjectManager.getInstance().getProjectWindow().getGenomeWindow().getChromosome()); } /** * @param genomeName the name of a genome * @param types a list of {@link VariantType} * @param chromosome a chromosome * @return the list of {@link Variant} */ public List<List<Variant>> getList (String genomeName, List<VariantType> types, Chromosome chromosome) { variants = new ArrayList<List<Variant>>(); this.genomeName = genomeName; this.types = types; this.chromosome = chromosome; List<VCFFile> fileList = getValidFileList(); for (VCFFile file: fileList) { currentContent = contentManager.getContent(file, chromosome); List<List<Variant>> list = getVariantList(); addFromListToList(variants, list); } for (List<Variant> list: variants) { Collections.sort(list, new VariantComparator()); } return variants; } /** * @param variant a {@link Variant} * @return the {@link VariantType} related to a {@link Variant} */ private VariantType getReferenceType (Variant variant) { if (variant instanceof InsertionVariant) { return VariantType.REFERENCE_INSERTION; } else if (variant instanceof DeletionVariant) { return VariantType.REFERENCE_DELETION; } else if (variant instanceof SNPVariant) { return VariantType.REFERENCE_SNP; } else if (variant instanceof NoCallVariant) { return VariantType.REFERENCE_NO_CALL; } return null; } /** * @return the list of {@link VCFFile} able to handle the required variation types for the required genome, an empty list if no files found. */ private List<VCFFile> getValidFileList () { List<VCFFile> validFiles = new ArrayList<VCFFile>(); List<VCFFile> fileList = contentManager.getFileList(); for (VCFFile file: fileList) { boolean hasData = false; for (VariantType type: types) { if (file.canManage(genomeName, type)) { hasData = true; } } if (hasData) { validFiles.add(file); } } return validFiles; } /** * Creates the list of {@link Variant} * @return the list of all {@link Variant} for all alleles */ private List<List<Variant>> getVariantList () { List<List<Variant>> result = getEmptyVariantList(currentContent); int lineNumber = currentContent.getSize(); MGLineContent line = new MGLineContent(); for (int i = 0; i < lineNumber; i++) { line = currentContent.getPosition(line, i); byte[] genotype = line.getGenotypes().get(genomeName); if (defineVariantType(line)) { if (isHomozygoteReference(genotype)) { Variant dominantVariant = getCurrentDominantVariant(i); Variant reference = new ReferenceVariant(currentContent, i, dominantVariant.getStart(), dominantVariant.getStop(), getReferenceType(dominantVariant)); for (List<Variant> currentList: result) { currentList.add(reference); } } else { byte[] correctedGenotype = getAdjustedGenotype(line); if (!isHomozygoteReference(correctedGenotype)) { // the corrected genotype must contain at least one index that is not reference related List<List<Variant>> list = getEmptyList(correctedGenotype.length); for (int j = 0; j < correctedGenotype.length; j++) { Variant variant = null; byte alternativeIndex = correctedGenotype[j]; if (alternativeIndex == MGSynchronizer.NO_CALL) { int start = ShiftCompute.getPosition(referenceGenomeName, null, line.getReferenceGenomePosition(), chromosome, metaGenomeName); variant = new NoCallVariant(currentContent, i, start); } else if (alternativeIndex >= 0) { variant = currentContent.getVariants().getVariant(alternativeIndex, i); } if (variant != null) { list.get(j).add(variant); } } list = fillWithReferences(list); list = adjustVariants(list); addFromListToList(result, list); } } } } return result; } /** * @param genotype a full genotype as an array of bytes * @return true if all alleles define the exact same variation, false otherwise */ private boolean isHomozygote (byte[] genotype) { int length = genotype.length; // A genotype of one/zero element is considered as homozygote if (length < 2) { return true; } // Try to see if one allele has a different alternative than the next one for (int i = 0; i < length; i++) { int nextIndex = i + 1; if (nextIndex < length) { if (genotype[i] != genotype[nextIndex]) { return false; } } } return true; } /** * @param genotype a full genotype as an array of bytes * @return true if all alleles define the reference, false otherwise */ private boolean isHomozygoteReference (byte[] genotype) { if (isHomozygote(genotype)) { return (genotype[0] == MGSynchronizer.REFERENCE); } return false; } }