/******************************************************************************* * GenPlay, Einstein Genome Analyzer * Copyright (C) 2009, 2014 Albert Einstein College of Medicine * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * Authors: Julien Lajugie <julien.lajugie@einstein.yu.edu> * Nicolas Fourel <nicolas.fourel@einstein.yu.edu> * Eric Bouhassira <eric.bouhassira@einstein.yu.edu> * * Website: <http://genplay.einstein.yu.edu> ******************************************************************************/ package edu.yu.einstein.genplay.core.multiGenome.operation.fileScanner; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import edu.yu.einstein.genplay.core.manager.project.ProjectManager; import edu.yu.einstein.genplay.core.multiGenome.VCF.BGZIPReader; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFLine; import edu.yu.einstein.genplay.core.multiGenome.VCF.VCFFile.VCFFile; import edu.yu.einstein.genplay.core.multiGenome.filter.MGFilter; import edu.yu.einstein.genplay.core.multiGenome.operation.synchronization.MGSynchronizer; import edu.yu.einstein.genplay.core.multiGenome.utils.VCFLineUtility; import edu.yu.einstein.genplay.dataStructure.enums.VariantType; import edu.yu.einstein.genplay.util.Utils; /** * This class reads a VCF file (as a gz) and can process some operations on the lines. * It checks if the line is valid according to the requirements (variations and filters). * * @author Nicolas Fourel * @version 0.1 */ public class ManualVCFReader { private final boolean includeReferences; private final boolean includeNoCall; private final VCFFile vcfFile; // The vcf file (.gz) private final BGZIPReader reader; // The gz reader private final List<String> genomeList; // The list of required genomes private final Map<String, List<VariantType>> variationMap; // map between genome names and their required variation private final List<MGFilter> filterList; // list of filter private VCFLine currentLine; // The current VCF line private List<Integer> allValidIndex; // The array that will contain all valid alternative indexes of the line private List<String> allValidGenome; // The array that will contain all valid genome names of the line private final MGSynchronizer synchronizer; // The Multi Genome Synchronizer /** * Constructor of {@link ManualVCFReader} * @param vcfFile the vcf file * @param genomeList the list of genome * @param variationMap the map of variations * @param filterList the list of filters * @param includeReferences include the references (0) * @param includeNoCall include the no call (.) * @throws Exception */ public ManualVCFReader (VCFFile vcfFile, List<String> genomeList, Map<String, List<VariantType>> variationMap, List<MGFilter> filterList, boolean includeReferences, boolean includeNoCall) throws Exception { this.vcfFile = vcfFile; this.genomeList = genomeList; this.variationMap = variationMap; this.filterList = filterList; this.includeReferences = includeReferences; this.includeNoCall = includeNoCall; synchronizer = ProjectManager.getInstance().getMultiGenomeProject().getMultiGenomeSynchronizer(); reader = new BGZIPReader(vcfFile); } /** * Goes to the next line in the file. * @throws IOException */ public void goNextLine () throws IOException { reader.goNextLine(); currentLine = reader.getCurrentLine(); } /** * Goes to the next line in the file. * It processes the line in order to know whether it passes the constraints or not. * The constraints are variations types and filters. * When the line passes the constraints, two lists are created: * - allValidGenome: the list of genome names their variations verify the constraints * - allValidIndex: the list of alternative indexes that verify the constraints * * @return the current VCF line * @throws IOException */ public VCFLine getNextValidLine () throws IOException { goNextLine(); return getCurrentValidLine(); } /** * Goes to the next line in the file. * Does not process anything, just return the next line * * @return the next line in the VCF file * @throws IOException */ public VCFLine getNextLine () throws IOException { goNextLine(); return reader.getCurrentLine(); } /** * Gets and processes the current line in the file. * It processes the line in order to know whether it passes the constraints or not. * The constraints are variations types and filters. * When the line passes the constraints, two lists are created: * - allValidGenome: the list of genome names their variations verify the constraints * - allValidIndex: the list of alternative indexes that verify the constraints * * @return the current line */ public VCFLine getCurrentValidLine () { currentLine = reader.getCurrentLine(); passValidation(currentLine); return currentLine; } /** * It processes the line in order to know whether it passes the constraints or not. * The constraints are variations types and filters. * When the line passes the constraints, two lists are created: * - allValidGenome: the list of genome names their variations verify the constraints * - allValidIndex: the list of alternative indexes that verify the constraints * * @param currentLine a vcf line */ private void passValidation (VCFLine currentLine) { boolean hasPassed = false; if (!currentLine.isLastLine() && currentLine.isValid()) { // The line has to be a valid line to be processed currentLine.processForAnalyse(); int[] lengths = VCFLineUtility.getVariantLengths(currentLine.getREF(), Utils.split(currentLine.getALT(), ','), currentLine.getINFO()); // Retrieves the length of all defined variations of the line VariantType[] variations = VCFLineUtility.getVariantTypes(lengths); // Converts the lengths into variation types (insertion, deletion...) allValidIndex = new ArrayList<Integer>(); // Initializes the array that will contain all valid alternative indexes of the line allValidGenome = new ArrayList<String>(); // Initializes the array that will contain all valid genome names of the line for (int i = 0; i < genomeList.size(); i++) { // Will scan information for all genomes of the line int[] altIndexes = getAlternativeIndexes(genomeList.get(i), reader, synchronizer); // Gets indexes defined by the GT type ('.' is converted as -1) List<Integer> validIndex = getValidIndexes(variationMap.get(genomeList.get(i)), variations, altIndexes); // Only keeps the valid ones (excludes the ones referring to the reference) if (validIndex.size() > 0) { // If we have found at least one valid index (one variant matching the variation requirements) if (isValid(currentLine)) { allValidGenome.add(genomeList.get(i)); // If the process comes here, it means information has been found for the current genome for (int index: validIndex) { // For all found indexes if (!allValidIndex.contains(index)) { // If it does not have been stored yet allValidIndex.add(index); // We store it } } } } } if (allValidGenome.size() > 0) { // If information has been found for at least one genome hasPassed = true; } } currentLine.setHasData(hasPassed); } /** * Retrieves the indexes of the alternatives defined by the genotype field of a genome * @param genomeName the name of the genome * @param reader the reader (to get the GT field) * @param synchronizer the multi genome synchronizer (to convert character as index) * @return an array of two integers containing the indexes */ private int[] getAlternativeIndexes (String genomeName, BGZIPReader reader, MGSynchronizer synchronizer) { int genomeIndex = reader.getIndexFromGenome(genomeName); String genotype = Utils.split(reader.getCurrentLine().getField(genomeIndex), ':')[0]; int size = (int) Math.floor(genotype.length() / 2); int[] indexes = new int[size]; int charIndex = 0; for (int i = 0; i < size; i++) { indexes[i] = VCFLineUtility.getAlleleIndex(genotype.charAt(charIndex) + ""); charIndex += 2; } return indexes; } /** * Compares the required variations and the ones from the line in order to select the correct indexes. * If it has to include the 0 genotype (refers to reference), the index will be -1. * @param requiredVariation the variations required for the export * @param variations the variations defined in the line * @param indexes an array of indexes referring to the variations of the line * @return the list of indexes related to required variations */ private List<Integer> getValidIndexes (List<VariantType> requiredVariation, VariantType[] variations, int[] indexes) { List<Integer> list = new ArrayList<Integer>(); for (int i = 0; i < indexes.length; i++) { int currentIndex = indexes[i]; boolean insert = false; if (currentIndex >= 0) { insert = requiredVariation.contains(variations[indexes[i]]); } if (insert && !list.contains(currentIndex)) { list.add(currentIndex); } } if (list.size() == 0) { if (isReferenceValid(requiredVariation, variations) && (includeReferences || includeNoCall)) { list.add(-1); } } return list; } /** * @param requiredVariation the required variation * @param variations the variation defined in the line * @return true if the line defines at least one of the required variation */ private boolean isReferenceValid (List<VariantType> requiredVariation, VariantType[] variations) { boolean result = false; for (VariantType type: variations) { if (requiredVariation.contains(type)) { result = true; break; } } return result; } /** * Tests the line with all required filters. * @param line the line to test * @return true if the line meet all filters requirements, false otherwise */ private boolean isValid (VCFLine line) { if (filterList != null) { for (MGFilter filter: filterList) { if (!filter.getFilter().isValid(line)) { return false; } } } return true; } /** * @return the vcfFile */ public VCFFile getVcfFile() { return vcfFile; } /** * @return the reader */ public BGZIPReader getReader() { return reader; } /** * @return the allValidIndex */ public List<Integer> getAllValidIndex() { return allValidIndex; } /** * @return the allValidGenome */ public List<String> getAllValidGenome() { return allValidGenome; } }