/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.multiGenome.operation.VCF; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFLine; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFFile.VCFFile; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFStatistics.VCFFileFullStatistic; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFStatistics.VCFFileStatistics; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFStatistics.VCFSampleStatistics; import edu.yu.einstein.genplay.core.multiGenome.operation.BasicEngine; import edu.yu.einstein.genplay.core.multiGenome.operation.fileScanner.FileScannerInterface; import edu.yu.einstein.genplay.core.multiGenome.operation.fileScanner.SingleFileScanner; import edu.yu.einstein.genplay.core.multiGenome.utils.FormattedMultiGenomeName; import edu.yu.einstein.genplay.dataStructure.enums.AlleleType; import edu.yu.einstein.genplay.dataStructure.enums.VariantType; /** * This method exports a VCF track into a VCF file. * * @author Nicolas Fourel * @version 0.1 */ public class MGOVCFStatisticsSingleFile extends BasicEngine { private VCFFileFullStatistic nativeStatistics; private VCFFileFullStatistic newStatistics; private List<String> genomeNames; private List<AlleleType> alleleTypeList; Map<AlleleType, Integer> alleleIndexMap; @Override protected boolean canStart() throws Exception { List<VCFFile> fileList = getFileList(); if (fileList.size() == 1) { fileScanner = new SingleFileScanner(this); nativeStatistics = fileList.get(0).getStatistics(); newStatistics = new VCFFileFullStatistic(); genomeNames = getGenomeList(); for (String genomeName: genomeNames) { newStatistics.addGenomeName(genomeName); } alleleTypeList = new ArrayList<AlleleType>(); alleleTypeList.add(AlleleType.ALLELE01); alleleTypeList.add(AlleleType.ALLELE02); alleleIndexMap = new HashMap<AlleleType, Integer>(); return true; } System.err.println("VCFExportEngineSingleFile.canStart() number of files invalid: " + fileList.size()); return false; } @Override public void processLine(FileScannerInterface fileAlgorithm) throws IOException { VCFLine currentLine = fileAlgorithm.getCurrentLine(); updateFileStatistics(newStatistics, currentLine.getAlternativesTypes(), currentLine.getAlternatives()); for (String genomeName: genomeNames) { String genomeRawName = FormattedMultiGenomeName.getRawName(genomeName); String genotype = currentLine.getGenotype(genomeRawName); if (genotype.length() == 3) { int alleleIndex01 = getAlleleIndex(genotype.charAt(0)); int alleleIndex02 = getAlleleIndex(genotype.charAt(2)); updateGenotypeSampleStatistics(newStatistics.getSampleStatistics(genomeName), currentLine.getAlternativesTypes(), alleleIndex01, alleleIndex02); for (AlleleType alleleType: alleleTypeList) { int currentAlleleIndex = alleleIndex01; if (alleleType == AlleleType.ALLELE02) { currentAlleleIndex = alleleIndex02; } if (currentAlleleIndex >= 0) { VariantType variantType = currentLine.getAlternativesTypes()[currentAlleleIndex]; updateVariationSampleStatistics(newStatistics.getSampleStatistics(genomeName), variantType, currentLine.getAlternatives()[currentAlleleIndex]); } } } } } @Override protected void process() throws Exception { fileScanner.compute(); } @Override public void processLine(VCFLine src, VCFLine dest) throws IOException {} /** * @return the generated statistics */ public VCFFileStatistics getNewStatistics() { return newStatistics; } /** * @return the statistics of the file */ public VCFFileStatistics getNativeStatistics() { return nativeStatistics; } /** * Updates statistics related to the file * @param statistic file statistics * @param variantTypes variant type array * @param alternatives alternatives array */ private void updateFileStatistics (VCFFileStatistics statistic, VariantType[] variantTypes, String[] alternatives) { statistic.incrementNumberOfLines(); for (int i = 0; i < variantTypes.length; i++) { if (variantTypes[i] == VariantType.SNPS) { statistic.incrementNumberOfSNPs(); } else if (variantTypes[i] == VariantType.INSERTION) { if (isStructuralVariant(alternatives[i])) { statistic.incrementNumberOfLongInsertions(); } else { statistic.incrementNumberOfShortInsertions(); } } else if (variantTypes[i] == VariantType.DELETION) { if (isStructuralVariant(alternatives[i])) { statistic.incrementNumberOfLongDeletions(); } else { statistic.incrementNumberOfShortDeletions(); } } } } /** * @param statistic sample statistics * @param variantTypes array of variant types * @param firstAlleleNumber number of the first allele * @param secondAlleleNumber number of the second allele */ private void updateGenotypeSampleStatistics (VCFSampleStatistics statistic, VariantType[] variantTypes, int firstAlleleIndex, int secondAlleleIndex) { boolean homozygote = isVariantHomozygote(firstAlleleIndex, secondAlleleIndex); boolean heterozygote = isVariantHeterozygote(firstAlleleIndex, secondAlleleIndex); for (VariantType variantType: variantTypes) { if (homozygote) { if (firstAlleleIndex > -1) { if (variantType == VariantType.SNPS) { statistic.incrementNumberOfHomozygoteSNPs(); } else if (variantType == VariantType.INSERTION) { statistic.incrementNumberOfHomozygoteInsertions(); } else if (variantType == VariantType.DELETION) { statistic.incrementNumberOfHomozygoteDeletions(); } } } else if (heterozygote) { if (variantType == VariantType.SNPS) { statistic.incrementNumberOfHeterozygoteSNPs(); } else if (variantType == VariantType.INSERTION) { statistic.incrementNumberOfHeterozygoteInsertions(); } else if (variantType == VariantType.DELETION) { statistic.incrementNumberOfHeterozygoteDeletions(); } } } } /** * Defines if a variant is homozygote according to its genotype. * @param firstAlleleNumber number related to the "first" allele * @param secondAlleleNumber number related to the "second" allele * @return true if the variant is homozygote, false otherwise */ private boolean isVariantHomozygote (int firstAlleleNumber, int secondAlleleNumber) { if ((firstAlleleNumber == secondAlleleNumber) && (firstAlleleNumber >= 0)) { return true; } return false; } /** * Defines if a variant is heterozygote according to its genotype. * @param firstAlleleNumber number related to the "first" allele * @param secondAlleleNumber number related to the "second" allele * @return true if the variant is heterozygote, false otherwise */ private boolean isVariantHeterozygote (int firstAlleleNumber, int secondAlleleNumber) { if (firstAlleleNumber != secondAlleleNumber) { if ((firstAlleleNumber >= 0) || (secondAlleleNumber >= 0)) { return true; } } return false; } /** * Updates statistics related to the sample * @param statistic sample statistics * @param variantType variant type * @param alternative alternative */ private void updateVariationSampleStatistics (VCFSampleStatistics statistic, VariantType variantType, String alternative) { if (variantType == VariantType.SNPS) { statistic.incrementNumberOfSNPs(); } else if (variantType == VariantType.INSERTION) { if (isStructuralVariant(alternative)) { statistic.incrementNumberOfLongInsertions(); } else { statistic.incrementNumberOfShortInsertions(); } } else if (variantType == VariantType.DELETION) { if (isStructuralVariant(alternative)) { statistic.incrementNumberOfLongDeletions(); } else { statistic.incrementNumberOfShortDeletions(); } } } /** * @param alternative ALT field (or part of it) * @return true if the given alternative is coded as an SV */ private boolean isStructuralVariant (String alternative) { if (alternative.charAt(0) == '<') { return true; } return false; } /** * Transforms a character into its allele index. * The char 1 will refer to the first alternative located at the index 0 of any arrays. * The char 0 returns -1 and the char '.' returns -2 and don't refer to any alternatives. * @param alleleChar the character * @return the associated code (char - 1) */ public int getAlleleIndex (char alleleChar) { int alleleIndex = -1; if (alleleChar == '.') { alleleIndex = -2; } else if (alleleChar == '0') { alleleIndex = -1; } else { try { alleleIndex = Integer.parseInt(alleleChar + "") - 1; } catch (Exception e) {} } return alleleIndex; } }